Total coverage: 212059 (11%)of 1996279
2 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 // SPDX-License-Identifier: GPL-2.0 /* * Handling of different ABIs (personalities). * * We group personalities into execution domains which have their * own handlers for kernel entry points, signal mapping, etc... * * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/module.h> #include <linux/personality.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/sysctl.h> #include <linux/types.h> #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) { seq_puts(m, "0-0\tLinux \t[kernel]\n"); return 0; } static int __init proc_execdomains_init(void) { proc_create_single("execdomains", 0, NULL, execdomains_proc_show); return 0; } module_init(proc_execdomains_init); #endif SYSCALL_DEFINE1(personality, unsigned int, personality) { unsigned int old = current->personality; if (personality != 0xffffffff) set_personality(personality); return old; }
2 1 2 2 2 2 2 2 2 2 133 85 133 27 133 160 160 77 77 77 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 // SPDX-License-Identifier: GPL-2.0 #include <linux/tcp.h> #include <net/tcp.h> static u32 tcp_rack_reo_wnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!tp->reord_seen) { /* If reordering has not been observed, be aggressive during * the recovery or starting the recovery by DUPACK threshold. */ if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery) return 0; if (tp->sacked_out >= tp->reordering && !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & TCP_RACK_NO_DUPTHRESH)) return 0; } /* To be more reordering resilient, allow min_rtt/4 settling delay. * Use min_rtt instead of the smoothed RTT because reordering is * often a path property and less related to queuing or delayed ACKs. * Upon receiving DSACKs, linearly increase the window up to the * smoothed RTT. */ return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps, tp->srtt_us >> 3); } s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) { return tp->rack.rtt_us + reo_wnd - tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): * * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * * dupthresh: 3 OOO packets delivered (packet count) * FACK: sequence delta to highest sacked sequence (sequence space) * RACK: sent time delta to the latest delivered packet (time domain) * * The advantage of RACK is it applies to both original and retransmitted * packet and therefore is robust against tail losses. Another advantage * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * * When tcp_rack_detect_loss() detects some packets are lost and we * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will * make us enter the CA_Recovery state. */ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; reo_wnd = tcp_rack_reo_wnd(sk); list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); s32 remaining; /* Skip ones marked lost but not yet retransmitted */ if ((scb->sacked & TCPCB_LOST) && !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; if (!tcp_skb_sent_after(tp->rack.mstamp, tcp_skb_timestamp_us(skb), tp->rack.end_seq, scb->end_seq)) break; /* A packet is lost if it has not been s/acked beyond * the recent RTT plus the reordering window. */ remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd); if (remaining <= 0) { tcp_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { /* Record maximum wait time */ *reo_timeout = max_t(u32, *reo_timeout, remaining); } } } bool tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; if (!tp->rack.advanced) return false; /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } return !!timeout; } /* Record the most recently (re)sent time among the (s)acked packets * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time) { u32 rtt_us; rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. * * If the original is lost, there is no ambiguity. Otherwise * we assume the original can be delayed up to aRTT + min_rtt. * the aRTT term is bounded by the fast recovery or timeout, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ return; } tp->rack.advanced = 1; tp->rack.rtt_us = rtt_us; if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, end_seq, tp->rack.end_seq)) { tp->rack.mstamp = xmit_time; tp->rack.end_seq = end_seq; } } /* We have waited long enough to accommodate reordering. Mark the expired * packets lost and retransmit them. */ void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout, prior_inflight; u32 lost = tp->lost; prior_inflight = tcp_packets_in_flight(tp); tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0); } tcp_xmit_retransmit_queue(sk); } if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } /* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. * * If a DSACK is received that seems like it may have been due to reordering * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded * by srtt), since there is possibility that spurious retransmission was * due to reordering delay longer than reo_wnd. * * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) * no. of successful recoveries (accounts for full DSACK-based loss * recovery undo). After that, reset it to default (min_rtt/4). * * At max, reo_wnd is incremented only once per rtt. So that the new * DSACK on which we are reacting, is due to the spurious retx (approx) * after the reo_wnd has been updated last time. * * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than * absolute value to account for change in rtt. */ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & TCP_RACK_STATIC_REO_WND) || !rs->prior_delivered) return; /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ if (before(rs->prior_delivered, tp->rack.last_delivered)) tp->rack.dsack_seen = 0; /* Adjust the reo_wnd if update is pending */ if (tp->rack.dsack_seen) { tp->rack.reo_wnd_steps = min_t(u32, 0xFF, tp->rack.reo_wnd_steps + 1); tp->rack.dsack_seen = 0; tp->rack.last_delivered = tp->delivered; tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; } else if (!tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_steps = 1; } } /* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits * the next unacked packet upon receiving * a) three or more DUPACKs to start the fast recovery * b) an ACK acknowledging new data during the fast recovery. */ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced) { const u8 state = inet_csk(sk)->icsk_ca_state; struct tcp_sock *tp = tcp_sk(sk); if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) || (state == TCP_CA_Recovery && snd_una_advanced)) { struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 mss; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) return; mss = tcp_skb_mss(skb); if (tcp_skb_pcount(skb) > 1 && skb->len > mss) tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, mss, mss, GFP_ATOMIC); tcp_mark_skb_lost(sk, skb); } }
2 2 2 6 4 4 1 1 6 4 6 4 1 1 4 2 5 5 6 6 5 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> #include <linux/namei.h> #include "misc.h" #include "disk-io.h" #include "extent-tree.h" #include "transaction.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" #include "space-info.h" #include "block-group.h" #include "discard.h" #include "zoned.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" #include "scrub.h" #include "super.h" #include "raid-stripe-tree.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) struct btrfs_io_geometry { u32 stripe_index; u32 stripe_nr; int mirror_num; int num_stripes; u64 stripe_offset; u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0, /* 0 == as many as possible */ .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, }, [BTRFS_RAID_RAID1] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 2, .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, [BTRFS_RAID_RAID1C3] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 3, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 3, .ncopies = 3, .nparity = 0, .raid_name = "raid1c3", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, }, [BTRFS_RAID_RAID1C4] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 4, .devs_min = 4, .tolerated_failures = 3, .devs_increment = 4, .ncopies = 4, .nparity = 0, .raid_name = "raid1c4", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, .dev_stripes = 2, .devs_max = 1, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, }, [BTRFS_RAID_RAID0] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, }, [BTRFS_RAID_SINGLE] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 1, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, }, [BTRFS_RAID_RAID5] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 2, .tolerated_failures = 1, .devs_increment = 1, .ncopies = 1, .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, }, [BTRFS_RAID_RAID6] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 1, .ncopies = 1, .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, }, }; /* * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which * can be used as index to access btrfs_raid_array[]. */ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) { const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); if (!profile) return BTRFS_RAID_SINGLE; return BTRFS_BG_FLAG_TO_INDEX(profile); } const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); if (index >= BTRFS_NR_RAID_TYPES) return NULL; return btrfs_raid_array[index].raid_name; } int btrfs_nr_parity_stripes(u64 type) { enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); return btrfs_raid_array[index].nparity; } /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. */ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) { int i; int ret; char *bp = buf; u64 flags = bg_flags; u32 size_bp = size_buf; if (!flags) { strcpy(bp, "NONE"); return; } #define DESCRIBE_FLAG(flag, desc) \ do { \ if (flags & (flag)) { \ ret = snprintf(bp, size_bp, "%s|", (desc)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ flags &= ~(flag); \ } \ } while (0) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, btrfs_raid_array[i].raid_name); #undef DESCRIBE_FLAG if (flags) { ret = snprintf(bp, size_bp, "0x%llx|", flags); size_bp -= ret; } if (size_bp < size_buf) buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ /* * The text is trimmed, it's up to the caller to provide sufficiently * large buffer */ out_overflow:; } static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); /* * Device locking * ============== * * There are several mutexes that protect manipulation of devices and low-level * structures like chunks but not block groups, extents or files * * uuid_mutex (global lock) * ------------------------ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from * the SCAN_DEV ioctl registration or from mount either implicitly (the first * device) or requested by the device= mount option * * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * * does not protect: manipulation of the fs_devices::devices list in general * but in mount context it could be used to exclude list modifications by eg. * scan ioctl * * btrfs_device::name - renames (write side), read is RCU * * fs_devices::device_list_mutex (per-fs, with RCU) * ------------------------------------------------ * protects updates to fs_devices::devices, ie. adding and deleting * * simple list traversal with read-only actions can be done with RCU protection * * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * * Is not required at mount and close times, because our device list is * protected by the uuid_mutex at that point. * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from * several places (internally, ioctl) * * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new * device is added/removed. Additionally it also protects post_commit_list of * individual devices, since they can be added to the transaction's * post_commit_list only with chunk_mutex held. * * cleaner_mutex * ------------- * a big lock that is held by the cleaner thread and prevents running subvolume * cleaning together with relocation or delayed iputs * * * Lock nesting * ============ * * uuid_mutex * device_list_mutex * chunk_mutex * balance_mutex * * * Exclusive operations * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. * * - Balance (*) * - Device add * - Device remove * - Device replace (*) * - Resize * * The device operations (as above) can be in one of the following states: * * - Running state * - Paused state * - Completed state * * Only device operations marked with (*) can go into the Paused state for the * following reasons: * * - ioctl (only Balance can be Paused through ioctl) * - filesystem remounted as read-only * - filesystem unmounted and mounted as read-only * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * * The status of exclusive operation is set and cleared atomically. * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. * The exclusive status is cleared when the device operation is canceled or * completed. */ DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) { return &fs_uuids; } /* * Allocate new btrfs_fs_devices structure identified by a fsid. * * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to * fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); } return fs_devs; } static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_free_device(device); } kfree(fs_devices); } void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; while (!list_empty(&fs_uuids)) { fs_devices = list_entry(fs_uuids.next, struct btrfs_fs_devices, fs_list); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, const u8 *fsid, const u8 *metadata_fsid) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) return false; if (!metadata_fsid) return true; if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) return false; return true; } static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; ASSERT(fsid); /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) return fs_devices; } return NULL; } static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", device_path, flags, ret); goto error; } bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); if (holder) { ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); if (ret) { fput(*bdev_file); goto error; } } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); fput(*bdev_file); goto error; } return 0; error: *disk_super = NULL; *bdev_file = NULL; return ret; } /* * Search and remove all stale devices (which are not mounted). When both * inputs are NULL, it will search and release all stale devices. * * @devt: Optional. When provided will it release all unmounted devices * matching this devt only. * @skip_device: Optional. Will skip this device when searching for the stale * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. * -ENOENT if @devt does not match any device in the list. */ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; int ret; bool freed = false; lockdep_assert_held(&uuid_mutex); /* Return good status if there is no instance of devt. */ ret = 0; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; if (devt && devt != device->devt) continue; if (fs_devices->opened) { if (devt) ret = -EBUSY; break; } /* delete the stale device */ fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); freed = true; } mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->num_devices == 0) { btrfs_sysfs_remove_fsid(fs_devices); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } /* If there is at least one freed device return 0. */ if (freed) return 0; return ret; } static struct btrfs_fs_devices *find_fsid_by_device( struct btrfs_super_block *disk_super, dev_t devt, bool *same_fsid_diff_dev) { struct btrfs_fs_devices *fsid_fs_devices; struct btrfs_fs_devices *devt_fs_devices; const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); bool found_by_devt = false; /* Find the fs_device by the usual method, if found use it. */ fsid_fs_devices = find_fsid(disk_super->fsid, has_metadata_uuid ? disk_super->metadata_uuid : NULL); /* The temp_fsid feature is supported only with single device filesystem. */ if (btrfs_super_num_devices(disk_super) != 1) return fsid_fs_devices; /* * A seed device is an integral component of the sprout device, which * functions as a multi-device filesystem. So, temp-fsid feature is * not supported. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) return fsid_fs_devices; /* Try to find a fs_devices by matching devt. */ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) { struct btrfs_device *device; list_for_each_entry(device, &devt_fs_devices->devices, dev_list) { if (device->devt == devt) { found_by_devt = true; break; } } if (found_by_devt) break; } if (found_by_devt) { /* Existing device. */ if (fsid_fs_devices == NULL) { if (devt_fs_devices->opened == 0) { /* Stale device. */ return NULL; } else { /* temp_fsid is mounting a subvol. */ return devt_fs_devices; } } else { /* Regular or temp_fsid device mounting a subvol. */ return devt_fs_devices; } } else { /* New device. */ if (fsid_fs_devices == NULL) { return NULL; } else { /* sb::fsid is already used create a new temp_fsid. */ *same_fsid_diff_dev = true; return NULL; } } /* Not reached. */ } /* * This is only used on mount, and we are protected from competing things * messing with our fs_devices by the uuid_mutex, thus we do not need the * fs_devices->device_list_mutex here. */ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; if (device->bdev) return -EINVAL; if (!device->name) return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, &bdev_file, &disk_super); if (ret) return ret; devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) goto error_free_page; if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) goto error_free_page; device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { pr_err( "BTRFS: Invalid seeding and uuid-changed device detected\n"); goto error_free_page; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; device->bdev_file = bdev_file; device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (device->devt != device->bdev->bd_dev) { btrfs_warn(NULL, "device %s maj:min changed from %d:%d to %d:%d", device->name->str, MAJOR(device->devt), MINOR(device->devt), MAJOR(device->bdev->bd_dev), MINOR(device->bdev->bd_dev)); device->devt = device->bdev->bd_dev; } fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { fs_devices->rw_devices++; list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } btrfs_release_disk_super(disk_super); return 0; error_free_page: btrfs_release_disk_super(disk_super); fput(bdev_file); return -EINVAL; } const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) { bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } /* * We can have very weird soft links passed in. * One example is "/proc/self/fd/<fd>", which can be a soft link to * a block device. * * But it's never a good idea to use those weird names. * Here we check if the path (not following symlinks) is a good one inside * "/dev/". */ static bool is_good_dev_path(const char *dev_path) { struct path path = { .mnt = NULL, .dentry = NULL }; char *path_buf = NULL; char *resolved_path; bool is_good = false; int ret; if (!dev_path) goto out; path_buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!path_buf) goto out; /* * Do not follow soft link, just check if the original path is inside * "/dev/". */ ret = kern_path(dev_path, 0, &path); if (ret) goto out; resolved_path = d_path(&path, path_buf, PATH_MAX); if (IS_ERR(resolved_path)) goto out; if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) goto out; is_good = true; out: kfree(path_buf); path_put(&path); return is_good; } static int get_canonical_dev_path(const char *dev_path, char *canonical) { struct path path = { .mnt = NULL, .dentry = NULL }; char *path_buf = NULL; char *resolved_path; int ret; if (!dev_path) { ret = -EINVAL; goto out; } path_buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!path_buf) { ret = -ENOMEM; goto out; } ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); if (ret) goto out; resolved_path = d_path(&path, path_buf, PATH_MAX); if (IS_ERR(resolved_path)) { ret = PTR_ERR(resolved_path); goto out; } ret = strscpy(canonical, resolved_path, PATH_MAX); out: kfree(path_buf); path_put(&path); return ret; } static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; struct path new = { .mnt = NULL, .dentry = NULL }; char *old_path = NULL; bool is_same = false; int ret; if (!device->name) goto out; old_path = kzalloc(PATH_MAX, GFP_NOFS); if (!old_path) goto out; rcu_read_lock(); ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); rcu_read_unlock(); if (ret < 0) goto out; ret = kern_path(old_path, LOOKUP_FOLLOW, &old); if (ret) goto out; ret = kern_path(new_path, LOOKUP_FOLLOW, &new); if (ret) goto out; if (path_equal(&old, &new)) is_same = true; out: kfree(old_path); path_put(&old); path_put(&new); return is_same; } /* * Add new device to list of registered devices * * Returns: * device pointer which was just added or updated when successful * error pointer when failed */ static noinline struct btrfs_device *device_list_add(const char *path, struct btrfs_super_block *disk_super, bool *new_device_added) { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = NULL; struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; int error; bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { btrfs_err(NULL, "device %s has incomplete metadata_uuid change, please use btrfstune to complete", path); return ERR_PTR(-EAGAIN); } error = lookup_bdev(path, &path_devt); if (error) { btrfs_err(NULL, "failed to lookup block device for path %s: %d", path, error); return ERR_PTR(error); } fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); if (has_metadata_uuid) memcpy(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", path, MAJOR(path_devt), MINOR(path_devt), fs_devices->fsid); } mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); device = NULL; } else { struct btrfs_dev_lookup_args args = { .devid = devid, .uuid = disk_super->dev_item.uuid, }; mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, &args); if (found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); memcpy(fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); } } if (!device) { unsigned int nofs_flag; if (fs_devices->opened) { btrfs_err(NULL, "device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", path, MAJOR(path_devt), MINOR(path_devt), fs_devices->fsid, current->comm, task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, disk_super->dev_item.uuid, path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) { mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); fs_devices->num_devices++; device->fs_devices = fs_devices; *new_device_added = true; if (disk_super->label[0]) pr_info( "BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->label, devid, found_transid, path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); else pr_info( "BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->fsid, devid, found_transid, path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that * means this device was missing at time of FS mount. * 2. If you are here and if the device->name is different * from 'path' that means either * a. The same device disappeared and reappeared with * different name. or * b. The missing-disk-which-was-replaced, has * reappeared now. * * We must allow 1 and 2a above. But 2b would be a spurious * and unintentional. * * Further in case of 1 and 2a above, the disk at 'path' * would have missed some transaction when it was away and * in case of 2a the stale bdev has to be updated as well. * 2b must not be allowed at all time. */ /* * For now, we do allow update to btrfs_fs_device through the * btrfs dev scan cli after FS has been mounted. We're still * tracking a problem where systems fail mount by subvolume id * when we reject replacement on a mounted FS. */ if (!fs_devices->opened && found_transid < device->generation) { /* * That is if the FS is _not_ mounted and if you * are here, that means there is more than one * disk with same uuid and devid.We keep the one * with larger generation number or the last-in if * generation are equal. */ mutex_unlock(&fs_devices->device_list_mutex); btrfs_err(NULL, "device %s already registered with a higher generation, found %llu expect %llu", path, found_transid, device->generation); return ERR_PTR(-EEXIST); } /* * We are going to replace the device path for a given devid, * make sure it's the same device if the device is mounted * * NOTE: the device->fs_info may not be reliable here so pass * in a NULL to message helpers instead. This avoids a possible * use-after-free when the fs_info and fs_info->sb are already * torn down. */ if (device->bdev) { if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, current->comm, task_pid_nr(current)); return ERR_PTR(-EEXIST); } btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-ENOMEM); } rcu_string_free(device->name); rcu_assign_pointer(device->name, name); if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } device->devt = path_devt; } /* * Unmount does not free the btrfs_device struct but would zero * generation along with most of the other members. So just update * it back. We need it to pick the disk with largest generation * (as above). */ if (!fs_devices->opened) { device->generation = found_transid; fs_devices->latest_generation = max_t(u64, found_transid, fs_devices->latest_generation); } fs_devices->total_devices = btrfs_super_num_devices(disk_super); mutex_unlock(&fs_devices->device_list_mutex); return device; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) { struct btrfs_fs_devices *fs_devices; struct btrfs_device *device; struct btrfs_device *orig_dev; int ret = 0; lockdep_assert_held(&uuid_mutex); fs_devices = alloc_fs_devices(orig->fsid); if (IS_ERR(fs_devices)) return fs_devices; fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { const char *dev_path = NULL; /* * This is ok to do without RCU read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ if (orig_dev->name) dev_path = orig_dev->name->str; device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid, dev_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } if (orig_dev->zone_info) { struct btrfs_zoned_device_info *zone_info; zone_info = btrfs_clone_dev_zone_info(orig_dev); if (!zone_info) { btrfs_free_device(device); ret = -ENOMEM; goto error; } device->zone_info = zone_info; } list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; } return fs_devices; error: free_fs_devices(fs_devices); return ERR_PTR(ret); } static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && (!*latest_dev || device->generation > (*latest_dev)->generation)) { *latest_dev = device; } continue; } /* * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, * in btrfs_init_dev_replace() so just continue. */ if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; if (device->bdev_file) { fput(device->bdev_file); device->bdev = NULL; device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; btrfs_free_device(device); } } /* * After we have read the system tree and know devids belonging to this * filesystem, remove the device which does not belong there. */ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *latest_dev = NULL; struct btrfs_fs_devices *seed_dev; mutex_lock(&uuid_mutex); __btrfs_free_extra_devids(fs_devices, &latest_dev); list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) __btrfs_free_extra_devids(seed_dev, &latest_dev); fs_devices->latest_dev = latest_dev; mutex_unlock(&uuid_mutex); } static void btrfs_close_bdev(struct btrfs_device *device) { if (!device->bdev) return; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } if (device->devid == BTRFS_DEV_REPLACE_DEVID) clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices--; } btrfs_close_bdev(device); if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; device->bdev_file = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); btrfs_destroy_dev_zone_info(device); device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); extent_io_tree_release(&device->alloc_state); /* * Reset the flush error record. We might have a transient flush error * in this mount, and if so we aborted the current transaction and set * the fs to an error state, guaranteeing no super blocks can be further * committed. However that error might be transient and if we unmount the * filesystem and mount it again, we should allow the mount to succeed * (btrfs_check_rw_degradable() should not fail) - if after mounting the * filesystem again we still get flush errors, then we will again abort * any transaction and set the error state, guaranteeing no commits of * unsafe super blocks. */ device->last_flush_error = 0; /* Verify the device is back in a pristine state */ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); WARN_ON(!list_empty(&device->dev_alloc_list)); WARN_ON(!list_empty(&device->post_commit_list)); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; lockdep_assert_held(&uuid_mutex); if (--fs_devices->opened > 0) return; list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; fs_devices->fs_info = NULL; } void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { LIST_HEAD(list); struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); if (!fs_devices->opened) { list_splice_init(&fs_devices->seed_list, &list); /* * If the struct btrfs_fs_devices is not assembled with any * other device, it can be re-initialized during the next mount * without the needing device-scan step. Therefore, it can be * fully freed. */ if (fs_devices->num_devices == 1) { list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder) { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { int ret2; ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); if (ret2 == 0 && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; } else if (ret2 == -ENODATA) { fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); } if (ret == 0 && ret2 != 0) ret = ret2; } if (fs_devices->open_devices == 0) { if (ret) return ret; return -EINVAL; } fs_devices->opened = 1; fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; #ifdef CONFIG_BTRFS_EXPERIMENTAL fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; fs_devices->read_devid = latest_dev->devid; fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), &value); if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) fs_devices->collect_fs_stats = true; if (value) { if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) fs_devices->rr_min_contig_read = value; if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) fs_devices->read_devid = value; } #else fs_devices->read_policy = BTRFS_READ_POLICY_PID; #endif return 0; } static int devid_cmp(void *priv, const struct list_head *a, const struct list_head *b) { const struct btrfs_device *dev1, *dev2; dev1 = list_entry(a, struct btrfs_device, dev_list); dev2 = list_entry(b, struct btrfs_device, dev_list); if (dev1->devid < dev2->devid) return -1; else if (dev1->devid > dev2->devid) return 1; return 0; } int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder) { int ret; lockdep_assert_held(&uuid_mutex); /* * The device_list_mutex cannot be taken here in case opening the * underlying device takes further locks like open_mutex. * * We also don't need the lock here as this is called during mount and * exclusion is provided by uuid_mutex */ if (fs_devices->opened) { fs_devices->opened++; ret = 0; } else { list_sort(NULL, &fs_devices->devices, devid_cmp); ret = open_fs_devices(fs_devices, flags, holder); } return ret; } void btrfs_release_disk_super(struct btrfs_super_block *super) { struct page *page = virt_to_page(super); put_page(page); } static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, u64 bytenr_orig) { struct btrfs_super_block *disk_super; struct page *page; void *p; pgoff_t index; /* make sure our super fits in the device */ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ if (sizeof(*disk_super) > PAGE_SIZE) return ERR_PTR(-EINVAL); /* make sure our super doesn't straddle pages on disk */ index = bytenr >> PAGE_SHIFT; if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) return ERR_PTR(-EINVAL); /* pull in the page with our super */ page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); if (IS_ERR(page)) return ERR_CAST(page); p = page_address(page); /* align our pointer to the offset of the super block */ disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(disk_super) != bytenr_orig || btrfs_super_magic(disk_super) != BTRFS_MAGIC) { btrfs_release_disk_super(p); return ERR_PTR(-EINVAL); } if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; return disk_super; } int btrfs_forget_devices(dev_t devt) { int ret; mutex_lock(&uuid_mutex); ret = btrfs_free_stale_devices(devt, NULL); mutex_unlock(&uuid_mutex); return ret; } static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, const char *path, dev_t devt, bool mount_arg_dev) { struct btrfs_fs_devices *fs_devices; /* * Do not skip device registration for mounted devices with matching * maj:min but different paths. Booting without initrd relies on * /dev/root initially, later replaced with the actual root device. * A successful scan ensures grub2-probe selects the correct device. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { struct btrfs_device *device; mutex_lock(&fs_devices->device_list_mutex); if (!fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); continue; } list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev && (device->bdev->bd_dev == devt) && strcmp(device->name->str, path) != 0) { mutex_unlock(&fs_devices->device_list_mutex); /* Do not skip registration. */ return false; } } mutex_unlock(&fs_devices->device_list_mutex); } if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) return true; return false; } /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock * is read via pagecache. * * With @mount_arg_dev it's a scan during mount time that will always register * the device or return an error. Multi-device and seeding devices are registered * in both cases. */ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool mount_arg_dev) { struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); if (!is_good_dev_path(path)) { canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); if (canonical_path) { ret = get_canonical_dev_path(path, canonical_path); if (ret < 0) { kfree(canonical_path); canonical_path = NULL; } } } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, * resulting in failure. * Since the device scan is solely for reading purposes, there is no * need for an exclusive open. Additionally, the devices are read again * during the mount process. It is ok to get some inconsistent * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); /* * We would like to check all the super blocks, but doing so would * allow a mount to succeed after a mkfs from a different filesystem. * Currently, recovery from a bad primary btrfs superblock is done * using the userspace command 'btrfs check --super'. */ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, btrfs_sb_offset(0)); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } devt = file_bdev(bdev_file)->bd_dev; if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", path, MAJOR(devt), MINOR(devt)); btrfs_free_stale_devices(devt, NULL); device = NULL; goto free_disk_super; } device = device_list_add(canonical_path ? : path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: fput(bdev_file); kfree(canonical_path); return device; } /* * Try to find a chunk that intersects [start, start + len] range and when one * such is found, record the end of it in *start */ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, u64 len) { u64 physical_start, physical_end; lockdep_assert_held(&device->fs_info->chunk_mutex); if (find_first_extent_bit(&device->alloc_state, *start, &physical_start, &physical_end, CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, physical_end + 1 - physical_start)) { *start = physical_end + 1; return true; } } return false; } static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular * allocator, because we anyway use/reserve the first two zones * for superblock logging. */ return 0; default: BUG(); } } static bool dev_extent_hole_check_zoned(struct btrfs_device *device, u64 *hole_start, u64 *hole_size, u64 num_bytes) { u64 zone_size = device->zone_info->zone_size; u64 pos; int ret; bool changed = false; ASSERT(IS_ALIGNED(*hole_start, zone_size)); while (*hole_size > 0) { pos = btrfs_find_allocatable_zones(device, *hole_start, *hole_start + *hole_size, num_bytes); if (pos != *hole_start) { *hole_size = *hole_start + *hole_size - pos; *hole_start = pos; changed = true; if (*hole_size < num_bytes) break; } ret = btrfs_ensure_empty_zones(device, pos, num_bytes); /* Range is ensured to be empty */ if (!ret) return changed; /* Given hole range was invalid (outside of device) */ if (ret == -ERANGE) { *hole_start += *hole_size; *hole_size = 0; return true; } *hole_start += zone_size; *hole_size -= zone_size; changed = true; } return changed; } /* * Check if specified hole is suitable for allocation. * * @device: the device which we have the hole * @hole_start: starting position of the hole * @hole_size: the size of the hole * @num_bytes: the size of the free space that we need * * This function may modify @hole_start and @hole_size to reflect the suitable * position for allocation. Returns 1 if hole position is updated, 0 otherwise. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, u64 *hole_size, u64 num_bytes) { bool changed = false; u64 hole_end = *hole_start + *hole_size; for (;;) { /* * Check before we set max_hole_start, otherwise we could end up * sending back this offset anyway. */ if (contains_pending_extent(device, hole_start, *hole_size)) { if (hole_end >= *hole_start) *hole_size = hole_end - *hole_start; else *hole_size = 0; changed = true; } switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: /* No extra check */ break; case BTRFS_CHUNK_ALLOC_ZONED: if (dev_extent_hole_check_zoned(device, hole_start, hole_size, num_bytes)) { changed = true; /* * The changed hole can contain pending extent. * Loop again to check that. */ continue; } break; default: BUG(); } break; } return changed; } /* * Find free space in the specified device. * * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @search_start: the position from which to begin the search * @start: store the start of the free space. * @len: the size of the free space. that we find, or the size * of the max free space if we don't find suitable free space * * This does a pretty simple search, the expectation is that it is called very * infrequently and that a given device has a small number of extents. * * @start is used to store the start of the free space if we find. But if we * don't find suitable free space, it will be used to store the start position * of the max free space. * * @len is used to store the size of the free space that we find. * But if we don't find suitable free space, it is used to store the size of * the max free space. * * NOTE: This function will search *commit* root of device tree, and does extra * check to ensure dev extents are not double allocated. * This makes the function safe to allocate dev extents but may not report * correct usable device space, as device extent freed in current transaction * is not reported as available. */ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; u64 search_start; u64 hole_size; u64 max_hole_start; u64 max_hole_size = 0; u64 extent_end; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; search_start = dev_extent_search_start(device); max_hole_start = search_start; WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } again: if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -ENOSPC; goto out; } path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out; while (search_start < search_end) { l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { ret = btrfs_next_leaf(root, path); if (ret == 0) continue; if (ret < 0) goto out; break; } btrfs_item_key_to_cpu(l, &key, slot); if (key.objectid < device->devid) goto next; if (key.objectid > device->devid) break; if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; if (key.offset > search_end) break; if (key.offset > search_start) { hole_size = key.offset - search_start; dev_extent_hole_check(device, &search_start, &hole_size, num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; } /* * If this free space is greater than which we need, * it must be the max free space that we have found * until now, so max_hole_start must point to the start * of this free space and the length of this free space * is stored in max_hole_size. Thus, we return * max_hole_start and max_hole_size and go back to the * caller. */ if (hole_size >= num_bytes) { ret = 0; goto out; } } dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); extent_end = key.offset + btrfs_dev_extent_length(l, dev_extent); if (extent_end > search_start) search_start = extent_end; next: path->slots[0]++; cond_resched(); } /* * At this point, search_start should be the end of * allocated dev extents, and when shrinking the device, * search_end may be smaller than search_start. */ if (search_end > search_start) { hole_size = search_end - search_start; if (dev_extent_hole_check(device, &search_start, &hole_size, num_bytes)) { btrfs_release_path(path); goto again; } if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; } } /* See above. */ if (max_hole_size < num_bytes) ret = -ENOSPC; else ret = 0; ASSERT(max_hole_start + max_hole_size <= search_end); out: btrfs_free_path(path); *start = max_hole_start; if (len) *len = max_hole_size; return ret; } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; int ret; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf = NULL; struct btrfs_dev_extent *extent = NULL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, BTRFS_DEV_EXTENT_KEY); if (ret) goto out; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); key = found_key; btrfs_release_path(path); goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { goto out; } *dev_extent_len = btrfs_dev_extent_length(leaf, extent); ret = btrfs_del_item(trans, root, path); if (ret == 0) set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); out: btrfs_free_path(path); return ret; } static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { struct rb_node *n; u64 ret = 0; read_lock(&fs_info->mapping_tree_lock); n = rb_last(&fs_info->mapping_tree.rb_root); if (n) { struct btrfs_chunk_map *map; map = rb_entry(n, struct btrfs_chunk_map, rb_node); ret = map->start + map->chunk_len; } read_unlock(&fs_info->mapping_tree_lock); return ret; } static noinline int find_next_devid(struct btrfs_fs_info *fs_info, u64 *devid_ret) { int ret; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) goto error; if (ret == 0) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); ret = -EUCLEAN; goto error; } ret = btrfs_previous_item(fs_info->chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY); if (ret) { *devid_ret = 1; } else { btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); *devid_ret = found_key.offset + 1; } ret = 0; error: btrfs_free_path(path); return ret; } /* * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; struct btrfs_path *path; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; unsigned long ptr; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; btrfs_reserve_chunk_metadata(trans, true); ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, &key, sizeof(*dev_item)); btrfs_trans_release_chunk_metadata(trans); if (ret) goto out; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); btrfs_set_device_id(leaf, dev_item, device->devid); btrfs_set_device_generation(leaf, dev_item, 0); btrfs_set_device_type(leaf, dev_item, device->type); btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); btrfs_set_device_total_bytes(leaf, dev_item, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); btrfs_set_device_group(leaf, dev_item, 0); btrfs_set_device_seek_speed(leaf, dev_item, 0); btrfs_set_device_bandwidth(leaf, dev_item, 0); btrfs_set_device_start_offset(leaf, dev_item, 0); ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); ret = 0; out: btrfs_free_path(path); return ret; } /* * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. * * We don't care about errors here, this is just to be kind to userspace. */ static void update_dev_time(const char *device_path) { struct path path; int ret; ret = kern_path(device_path, LOOKUP_FOLLOW, &path); if (ret) return; inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); path_put(&path); } static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); btrfs_trans_release_chunk_metadata(trans); if (ret) { if (ret > 0) ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); return ret; } /* * Verify that @num_devices satisfies the RAID profile constraints in the whole * filesystem. It's up to the caller to adjust that number regarding eg. device * replace. */ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, u64 num_devices) { u64 all_avail; unsigned seq; int i; do { seq = read_seqbegin(&fs_info->profiles_lock); all_avail = fs_info->avail_data_alloc_bits | fs_info->avail_system_alloc_bits | fs_info->avail_metadata_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; if (num_devices < btrfs_raid_array[i].devs_min) return btrfs_raid_array[i].mindev_error; } return 0; } static struct btrfs_device * btrfs_find_next_active_device( struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) { struct btrfs_device *next_device; list_for_each_entry(next_device, &fs_devs->devices, dev_list) { if (next_device != device && !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) && next_device->bdev) return next_device; } return NULL; } /* * Helper function to check if the given device is part of s_bdev / latest_dev * and replace it with the provided or the next active device, in the context * where this function called, there should be always be another device (or * this_dev) which is active. */ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, device); ASSERT(next_device); if (fs_info->sb->s_bdev && (fs_info->sb->s_bdev == device->bdev)) fs_info->sb->s_bdev = next_device->bdev; if (fs_info->fs_devices->latest_dev->bdev == device->bdev) fs_info->fs_devices->latest_dev = next_device; } /* * Return btrfs_fs_devices::num_devices excluding the device that's being * currently replaced. */ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) { u64 num_devices = fs_info->fs_devices->num_devices; down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { ASSERT(num_devices > 1); num_devices--; } up_read(&fs_info->dev_replace.rwsem); return num_devices; } static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, struct block_device *bdev, int copy_num) { struct btrfs_super_block *disk_super; const size_t len = sizeof(disk_super->magic); const u64 bytenr = btrfs_sb_offset(copy_num); int ret; disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); if (IS_ERR(disk_super)) return; memset(&disk_super->magic, 0, len); folio_mark_dirty(virt_to_folio(disk_super)); btrfs_release_disk_super(disk_super); ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); if (ret) btrfs_warn(fs_info, "error clearing superblock number %d (%d)", copy_num, ret); } void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { int copy_num; struct block_device *bdev = device->bdev; if (!bdev) return; for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { if (bdev_is_zoned(bdev)) btrfs_reset_sb_log_zones(bdev, copy_num); else btrfs_scratch_superblock(fs_info, bdev, copy_num); } /* Notify udev that device has changed */ btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ update_dev_time(device->name->str); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); return -EINVAL; } /* * The device list in fs_devices is accessed without locks (neither * uuid_mutex nor device_list_mutex) as it won't change on a mounted * filesystem and another device rm cannot run. */ num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { if (args->missing) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", btrfs_dev_name(device), device->devid); return -ETXTBSY; } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && fs_info->fs_devices->rw_devices == 1) return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; mutex_unlock(&fs_info->chunk_mutex); } ret = btrfs_shrink_device(device, 0); if (ret) goto error_undo; trans = btrfs_start_transaction(fs_info->chunk_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto error_undo; } ret = btrfs_rm_dev_item(trans, device); if (ret) { /* Any error in dev item removal is critical */ btrfs_crit(fs_info, "failed to remove device item for devid %llu: %d", device->devid, ret); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); /* * the device list mutex makes sure that we don't change * the device list while someone else is writing out all * the device supers. Whoever is writing all supers, should * lock the device list mutex before getting the number of * devices in the super block (super_copy). Conversely, * whoever updates the number of devices in the super block * (super_copy) should hold the device list mutex. */ /* * In normal cases the cur_devices == fs_devices. But in case * of deleting a seed device, the cur_devices should point to * its own fs_devices listed under the fs_devices->seed_list. */ cur_devices = device->fs_devices; mutex_lock(&fs_devices->device_list_mutex); list_del_rcu(&device->dev_list); cur_devices->num_devices--; cur_devices->total_devices--; /* Update total_devices of the parent fs_devices if it's seed */ if (cur_devices != fs_devices) fs_devices->total_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) cur_devices->missing_devices--; btrfs_assign_next_active_device(device, NULL); if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; btrfs_set_super_num_devices(fs_info->super_copy, num_devices); mutex_unlock(&fs_devices->device_list_mutex); /* * At this point, the device is zero sized and detached from the * devices list. All that's left is to zero out the old supers and * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb * write lock, and fput() on the block device will pull in the * ->open_mutex on the block device and it's dependencies. Instead * just flush the device and let the caller do the final bdev_release. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device); if (device->bdev) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } } *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); /* * This can happen if cur_devices is the private seed devices list. We * cannot call close_fs_devices() here because it expects the uuid_mutex * to be held, but in fact we don't need that for the private * seed_devices, we can simply decrement cur_devices->opened and then * remove it from our list and free the fs_devices. */ if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); ASSERT(cur_devices->opened == 1); cur_devices->opened--; free_fs_devices(cur_devices); } ret = btrfs_commit_transaction(trans); return ret; error_undo: if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, &fs_devices->alloc_list); device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices; lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); /* * in case of fs with no seed, srcdev->fs_devices will point * to fs_devices of fs_info. However when the dev being replaced is * a seed dev it will point to the seed's local fs_devices. In short * srcdev will have its correct fs_devices in both the cases. */ fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) fs_devices->missing_devices--; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) fs_devices->rw_devices--; if (srcdev->bdev) fs_devices->open_devices--; } void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; mutex_lock(&uuid_mutex); btrfs_close_bdev(srcdev); synchronize_rcu(); btrfs_free_device(srcdev); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { /* * On a mounted FS, num_devices can't be zero unless it's a * seed. In case of a seed device being replaced, the replace * target added to the sprout FS, so there will be no more * device left under the seed FS. */ ASSERT(fs_devices->seeding); list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; mutex_lock(&fs_devices->device_list_mutex); btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; fs_devices->num_devices--; btrfs_assign_next_active_device(tgtdev, NULL); list_del_rcu(&tgtdev->dev_list); mutex_unlock(&fs_devices->device_list_mutex); btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); btrfs_close_bdev(tgtdev); synchronize_rcu(); btrfs_free_device(tgtdev); } /* * Populate args from device at path. * * @fs_info: the filesystem * @args: the args to populate * @path: the path to the device * * This will read the super block of the device at @path and populate @args with * the devid, fsid, and uuid. This is meant to be used for ioctls that need to * lookup a device to operate on, but need to do it before we take any locks. * This properly handles the special case of "missing" that a user may pass in, * and does some basic sanity checks. The caller must make sure that @path is * properly NUL terminated before calling in, and must call * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and * uuid buffers. * * Return: 0 for success, -errno for failure */ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, const char *path) { struct btrfs_super_block *disk_super; struct file *bdev_file; int ret; if (!path || !path[0]) return -EINVAL; if (!strcmp(path, "missing")) { args->missing = true; return 0; } args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); if (!args->uuid || !args->fsid) { btrfs_put_dev_args_from_path(args); return -ENOMEM; } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; } args->devid = btrfs_stack_device_id(&disk_super->dev_item); memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); fput(bdev_file); return 0; } /* * Only use this jointly with btrfs_get_dev_args_from_path() because we will * allocate our ->uuid and ->fsid pointers, everybody else uses local variables * that don't need to be freed. */ void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) { kfree(args->uuid); kfree(args->fsid); args->uuid = NULL; args->fsid = NULL; } struct btrfs_device *btrfs_find_device_by_devspec( struct btrfs_fs_info *fs_info, u64 devid, const char *device_path) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *device; int ret; if (devid) { args.devid = devid; device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) return ERR_PTR(-ENOENT); return device; } ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); if (ret) return ERR_PTR(ret); device = btrfs_find_device(fs_info->fs_devices, &args); btrfs_put_dev_args_from_path(&args); if (!device) return ERR_PTR(-ENOENT); return device; } static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) return ERR_PTR(-EINVAL); /* * Private copy of the seed devices, anchored at * fs_info->fs_devices->seed_list */ seed_devices = alloc_fs_devices(NULL); if (IS_ERR(seed_devices)) return seed_devices; /* * It's necessary to retain a copy of the original seed fs_devices in * fs_uuids so that filesystems which have been seeded can successfully * reference the seed device from open_seed_devices. This also supports * multiple fs seed. */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); return old_devices; } list_add(&old_devices->fs_list, &fs_uuids); memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); return seed_devices; } /* * Splice seed devices into the sprout fs_devices. * Generate a new fsid for the sprouted read-write filesystem. */ static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *seed_devices) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_super_block *disk_super = fs_info->super_copy; struct btrfs_device *device; u64 super_flags; /* * We are updating the fsid, the thread leading to device_list_add() * could race, so uuid_mutex is needed. */ lockdep_assert_held(&uuid_mutex); /* * The threads listed below may traverse dev_list but can do that without * device_list_mutex: * - All device ops and balance - as we are in btrfs_exclop_start. * - Various dev_list readers - are using RCU. * - btrfs_ioctl_fitrim() - is using RCU. * * For-read threads as below are using device_list_mutex: * - Readonly scrub btrfs_scrub_dev() * - Readonly scrub btrfs_scrub_progress() * - btrfs_get_dev_stats() */ lockdep_assert_held(&fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); } /* * Store the expected generation for seed devices in device items. */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dev_item *dev_item; struct btrfs_device *device; struct btrfs_key key; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; while (1) { btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, 0, 1); btrfs_trans_release_chunk_metadata(trans); if (ret < 0) goto error; leaf = path->nodes[0]; next_slot: if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret > 0) break; if (ret < 0) goto error; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); continue; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || key.type != BTRFS_DEV_ITEM_KEY) break; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); args.uuid = dev_uuid; args.fsid = fs_uuid; device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); path->slots[0]++; goto next_slot; } ret = 0; error: btrfs_free_path(path); return ret; } int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; u64 orig_super_total_bytes; u64 orig_super_num_devices; int ret = 0; bool seeding_dev = false; bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); locked = true; } sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; } } rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); goto error; } device->fs_info = fs_info; device->bdev_file = bdev_file; device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error_free_device; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto error_free_zone; } set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; device->total_bytes = round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { ret = PTR_ERR(seed_devices); btrfs_abort_transaction(trans, ret); goto error_trans; } } mutex_lock(&fs_devices->device_list_mutex); if (seeding_dev) { btrfs_setup_sprout(fs_info, seed_devices); btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, device); } device->fs_devices = fs_devices; mutex_lock(&fs_info->chunk_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); list_add(&device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->num_devices++; fs_devices->open_devices++; fs_devices->rw_devices++; fs_devices->total_devices++; fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!bdev_nonrot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, round_down(orig_super_total_bytes + device->total_bytes, fs_info->sectorsize)); orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); /* * we've got more storage, clear any full flags on the space * infos */ btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); /* Add sysfs device entry */ btrfs_sysfs_add_device(device); mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } } ret = btrfs_add_dev_item(trans, device); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } if (seeding_dev) { ret = btrfs_finish_sprout(trans); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } /* * fs_devices now represents the newly sprouted filesystem and * its fsid has been changed by btrfs_sprout_splice(). */ btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); locked = false; if (ret) /* transaction commit */ return ret; ret = btrfs_relocate_sys_chunks(fs_info); if (ret < 0) btrfs_handle_fs_error(fs_info, ret, "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) == -ENOENT) return 0; ret = PTR_ERR(trans); trans = NULL; goto error_sysfs; } ret = btrfs_commit_transaction(trans); } /* * Now that we have written a new super block to this device, check all * other fs_devices list if device_path alienates any other scanned * device. * We can ignore the return value as it typically returns -EINVAL and * only succeeds if the device was an alien. */ btrfs_forget_devices(device->devt); /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); return ret; error_sysfs: btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); list_del(&device->dev_alloc_list); fs_info->fs_devices->num_devices--; fs_info->fs_devices->open_devices--; fs_info->fs_devices->rw_devices--; fs_info->fs_devices->total_devices--; fs_info->fs_devices->total_rw_bytes -= device->total_bytes; atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); btrfs_set_super_total_bytes(fs_info->super_copy, orig_super_total_bytes); btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: if (trans) btrfs_end_transaction(trans); error_free_zone: btrfs_destroy_dev_zone_info(device); error_free_device: btrfs_free_device(device); error: fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } return ret; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; struct btrfs_path *path; struct btrfs_root *root = device->fs_info->chunk_root; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); btrfs_set_device_id(leaf, dev_item, device->devid); btrfs_set_device_type(leaf, dev_item, device->type); btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); btrfs_set_device_total_bytes(leaf, dev_item, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); out: btrfs_free_path(path); return ret; } int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total; u64 diff; int ret; if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; new_size = round_down(new_size, fs_info->sectorsize); mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { mutex_unlock(&fs_info->chunk_mutex); return -EINVAL; } btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; atomic64_add(diff, &fs_info->free_chunk_space); btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->fs_info); if (list_empty(&device->post_commit_list)) list_add_tail(&device->post_commit_list, &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); return ret; } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); ret = -EUCLEAN; goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); goto out; } out: btrfs_free_path(path); return ret; } static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; int ret = 0; u32 num_stripes; u32 array_size; u32 len = 0; u32 cur; struct btrfs_key key; lockdep_assert_held(&fs_info->chunk_mutex); array_size = btrfs_super_sys_array_size(super_copy); ptr = super_copy->sys_chunk_array; cur = 0; while (cur < array_size) { disk_key = (struct btrfs_disk_key *)ptr; btrfs_disk_key_to_cpu(&key, disk_key); len = sizeof(*disk_key); if (key.type == BTRFS_CHUNK_ITEM_KEY) { chunk = (struct btrfs_chunk *)(ptr + len); num_stripes = btrfs_stack_chunk_num_stripes(chunk); len += btrfs_chunk_item_size(num_stripes); } else { ret = -EIO; break; } if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && key.offset == chunk_offset) { memmove(ptr, ptr + len, array_size - (cur + len)); array_size -= len; btrfs_set_super_sys_array_size(super_copy, array_size); } else { ptr += len; cur += len; } } return ret; } struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev; struct btrfs_chunk_map *map; struct btrfs_chunk_map *prev_map = NULL; while (node) { map = rb_entry(node, struct btrfs_chunk_map, rb_node); prev = node; prev_map = map; if (logical < map->start) { node = node->rb_left; } else if (logical >= map->start + map->chunk_len) { node = node->rb_right; } else { refcount_inc(&map->refs); return map; } } if (!prev) return NULL; orig_prev = prev; while (prev && logical >= prev_map->start + prev_map->chunk_len) { prev = rb_next(prev); prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); } if (!prev) { prev = orig_prev; prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); while (prev && logical < prev_map->start) { prev = rb_prev(prev); prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); } } if (prev) { u64 end = logical + length; /* * Caller can pass a U64_MAX length when it wants to get any * chunk starting at an offset of 'logical' or higher, so deal * with underflow by resetting the end offset to U64_MAX. */ if (end < logical) end = U64_MAX; if (end > prev_map->start && logical < prev_map->start + prev_map->chunk_len) { refcount_inc(&prev_map->refs); return prev_map; } } return NULL; } struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_chunk_map *map; read_lock(&fs_info->mapping_tree_lock); map = btrfs_find_chunk_map_nolock(fs_info, logical, length); read_unlock(&fs_info->mapping_tree_lock); return map; } /* * Find the mapping containing the given logical extent. * * @logical: Logical block offset in bytes. * @length: Length of extent in bytes. * * Return: Chunk mapping or ERR_PTR. */ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_chunk_map *map; map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, map->start + map->chunk_len); btrfs_free_chunk_map(map); return ERR_PTR(-EINVAL); } /* Callers are responsible for dropping the reference. */ return map; } static int remove_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map, u64 chunk_offset) { int i; /* * Removing chunk items and updating the device items in the chunks btree * requires holding the chunk_mutex. * See the comment at btrfs_chunk_alloc() for the details. */ lockdep_assert_held(&trans->fs_info->chunk_mutex); for (i = 0; i < map->num_stripes; i++) { int ret; ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) return ret; } return btrfs_free_chunk(trans, chunk_offset); } int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_chunk_map *map; u64 dev_extent_len = 0; int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(map)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); return PTR_ERR(map); } /* * First delete the device extent items from the devices btree. * We take the device_list_mutex to avoid racing with the finishing phase * of a device replace operation. See the comment below before acquiring * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex * because that can result in a deadlock when deleting the device extent * items from the devices btree - COWing an extent buffer from the btree * may result in allocating a new metadata chunk, which would attempt to * lock again fs_info->chunk_mutex. */ mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); goto out; } if (device->bytes_used > 0) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } } mutex_unlock(&fs_devices->device_list_mutex); /* * We acquire fs_info->chunk_mutex for 2 reasons: * * 1) Just like with the first phase of the chunk allocation, we must * reserve system space, do all chunk btree updates and deletions, and * update the system chunk array in the superblock while holding this * mutex. This is for similar reasons as explained on the comment at * the top of btrfs_chunk_alloc(); * * 2) Prevent races with the final phase of a device replace operation * that replaces the device object associated with the map's stripes, * because the device object's id can change at any time during that * final phase of the device replace operation * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the * replaced device and then see it with an ID of * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating * the device item, which does not exists on the chunk btree. * The finishing phase of device replace acquires both the * device_list_mutex and the chunk_mutex, in that order, so we are * safe by just acquiring the chunk_mutex. */ trans->removing_chunk = true; mutex_lock(&fs_info->chunk_mutex); check_system_chunk(trans, map->type); ret = remove_chunk_item(trans, map, chunk_offset); /* * Normally we should not get -ENOSPC since we reserved space before * through the call to check_system_chunk(). * * Despite our system space_info having enough free space, we may not * be able to allocate extents from its block groups, because all have * an incompatible profile, which will force us to allocate a new system * block group with the right profile, or right after we called * check_system_space() above, a scrub turned the only system block group * with enough free space into RO mode. * This is explained with more detail at do_chunk_alloc(). * * So if we get -ENOSPC, allocate a new system chunk and retry once. */ if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = remove_chunk_item(trans, map, chunk_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } else if (ret) { btrfs_abort_transaction(trans, ret); goto out; } trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } mutex_unlock(&fs_info->chunk_mutex); trans->removing_chunk = false; /* * We are done with chunk btree updates and deletions, so release the * system space we previously reserved (with check_system_chunk()). */ btrfs_trans_release_chunk_metadata(trans); ret = btrfs_remove_block_group(trans, map); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } out: if (trans->removing_chunk) { mutex_unlock(&fs_info->chunk_mutex); trans->removing_chunk = false; } /* once for us */ btrfs_free_chunk_map(map); return ret; } int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_block_group *block_group; u64 length; int ret; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "relocate: not supported on extent tree v2 yet"); return -EINVAL; } /* * Prevent races with automatic removal of unused block groups. * After we relocate and before we remove the chunk with offset * chunk_offset, automatic removal of the block group can kick in, * resulting in a failure when calling btrfs_remove_chunk() below. * * Make sure to acquire this mutex before doing a tree search (dev * or chunk trees) to find chunks. Otherwise the cleaner kthread might * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. */ lockdep_assert_held(&fs_info->reclaim_bgs_lock); /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); ret = btrfs_relocate_block_group(fs_info, chunk_offset); btrfs_scrub_continue(fs_info); if (ret) { /* * If we had a transaction abort, stop all running scrubs. * See transaction.c:cleanup_transaction() why we do it here. */ if (BTRFS_FS_ERROR(fs_info)) btrfs_scrub_cancel(fs_info); return ret; } block_group = btrfs_lookup_block_group(fs_info, chunk_offset); if (!block_group) return -ENOENT; btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); length = block_group->length; btrfs_put_block_group(block_group); /* * On a zoned file system, discard the whole block group, this will * trigger a REQ_OP_ZONE_RESET operation on the device zone. If * resetting the zone fails, don't treat it as a fatal problem from the * filesystem's point of view. */ if (btrfs_is_zoned(fs_info)) { ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); if (ret) btrfs_info(fs_info, "failed to reset zone %llu after relocation", chunk_offset); } trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } /* * step two, delete the device extents and the * chunk tree entries */ ret = btrfs_remove_chunk(trans, chunk_offset); btrfs_end_transaction(trans); return ret; } static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) { struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_chunk *chunk; struct btrfs_key key; struct btrfs_key found_key; u64 chunk_type; bool retried = false; int failed = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } if (ret == 0) { /* * On the first search we would find chunk tree with * offset -1, which is not possible. On subsequent * loops this would find an existing item on an invalid * offset (one less than the previous one, wrong * alignment and size). */ ret = -EUCLEAN; mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); if (ret) mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto error; if (ret > 0) break; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk); chunk_type = btrfs_chunk_type(leaf, chunk); btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_relocate_chunk(fs_info, found_key.offset); if (ret == -ENOSPC) failed++; else BUG_ON(ret); } mutex_unlock(&fs_info->reclaim_bgs_lock); if (found_key.offset == 0) break; key.offset = found_key.offset - 1; } ret = 0; if (failed && !retried) { failed = 0; retried = true; goto again; } else if (WARN_ON(failed && retried)) { ret = -ENOSPC; } error: btrfs_free_path(path); return ret; } /* * return 1 : allocate a data chunk successfully, * return <0: errors during allocating a data chunk, * return 0 : no need to allocate a data chunk. */ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_block_group *cache; u64 bytes_used; u64 chunk_type; cache = btrfs_lookup_block_group(fs_info, chunk_offset); ASSERT(cache); chunk_type = cache->flags; btrfs_put_block_group(cache); if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) return 0; spin_lock(&fs_info->data_sinfo->lock); bytes_used = fs_info->data_sinfo->bytes_used; spin_unlock(&fs_info->data_sinfo->lock); if (!bytes_used) { struct btrfs_trans_handle *trans; int ret; trans = btrfs_join_transaction(fs_info->tree_root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); btrfs_end_transaction(trans); if (ret < 0) return ret; return 1; } return 0; } static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, const struct btrfs_disk_balance_args *disk) { memset(cpu, 0, sizeof(*cpu)); cpu->profiles = le64_to_cpu(disk->profiles); cpu->usage = le64_to_cpu(disk->usage); cpu->devid = le64_to_cpu(disk->devid); cpu->pstart = le64_to_cpu(disk->pstart); cpu->pend = le64_to_cpu(disk->pend); cpu->vstart = le64_to_cpu(disk->vstart); cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); cpu->limit = le64_to_cpu(disk->limit); cpu->stripes_min = le32_to_cpu(disk->stripes_min); cpu->stripes_max = le32_to_cpu(disk->stripes_max); } static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, const struct btrfs_balance_args *cpu) { memset(disk, 0, sizeof(*disk)); disk->profiles = cpu_to_le64(cpu->profiles); disk->usage = cpu_to_le64(cpu->usage); disk->devid = cpu_to_le64(cpu->devid); disk->pstart = cpu_to_le64(cpu->pstart); disk->pend = cpu_to_le64(cpu->pend); disk->vstart = cpu_to_le64(cpu->vstart); disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); disk->limit = cpu_to_le64(cpu->limit); disk->stripes_min = cpu_to_le32(cpu->stripes_min); disk->stripes_max = cpu_to_le32(cpu->stripes_max); } static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; struct btrfs_balance_item *item; struct btrfs_disk_balance_args disk_bargs; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; int ret, err; path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } key.objectid = BTRFS_BALANCE_OBJECTID; key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*item)); if (ret) goto out; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); btrfs_set_balance_data(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); btrfs_set_balance_flags(leaf, item, bctl->flags); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; return ret; } static int del_balance_item(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_key key; int ret, err; path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_start_transaction_fallback_global_rsv(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } key.objectid = BTRFS_BALANCE_OBJECTID; key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; return ret; } /* * This is a heuristic used to reduce the number of chunks balanced on * resume after balance was interrupted. */ static void update_balance_args(struct btrfs_balance_control *bctl) { /* * Turn on soft mode for chunk types that were being converted. */ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; /* * Turn on usage filter if is not already used. The idea is * that chunks that we have already balanced should be * reasonably full. Don't do it for chunks that are being * converted - that will keep us from relocating unconverted * (albeit full) chunks. */ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->data.usage = 90; } if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->sys.usage = 90; } if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->meta.usage = 90; } } /* * Clear the balance status in fs_info and delete the balance item from disk. */ static void reset_balance_state(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL; spin_unlock(&fs_info->balance_lock); kfree(bctl); ret = del_balance_item(fs_info); if (ret) btrfs_handle_fs_error(fs_info, ret, NULL); } /* * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). */ static int chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->profiles & chunk_type) return 0; return 1; } static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh_min = 0; else user_thresh_min = mult_perc(cache->length, bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) user_thresh_max = cache->length; else user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; btrfs_put_block_group(cache); return ret; } static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->length; else user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; btrfs_put_block_group(cache); return ret; } static int chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); int i; for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) return 0; } return 1; } static u64 calc_data_stripes(u64 type, int num_stripes) { const int index = btrfs_bg_flags_to_raid_index(type); const int ncopies = btrfs_raid_array[index].ncopies; const int nparity = btrfs_raid_array[index].nparity; return (num_stripes - nparity) / ncopies; } /* [pstart, pend) */ static int chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); u64 stripe_offset; u64 stripe_length; u64 type; int factor; int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) return 0; type = btrfs_chunk_type(leaf, chunk); factor = calc_data_stripes(type, num_stripes); for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) continue; stripe_offset = btrfs_stripe_offset(leaf, stripe); stripe_length = btrfs_chunk_length(leaf, chunk); stripe_length = div_u64(stripe_length, factor); if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) return 0; } return 1; } /* [vstart, vend) */ static int chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset, struct btrfs_balance_args *bargs) { if (chunk_offset < bargs->vend && chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) /* at least part of the chunk is inside this vrange */ return 0; return 1; } static int chunk_stripes_range_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_balance_args *bargs) { int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); if (bargs->stripes_min <= num_stripes && num_stripes <= bargs->stripes_max) return 0; return 1; } static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return 0; chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->target == chunk_type) return 1; return 0; } static int should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { return 0; } if (chunk_type & BTRFS_BLOCK_GROUP_DATA) bargs = &bctl->data; else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) bargs = &bctl->sys; else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) bargs = &bctl->meta; /* profiles filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && chunk_profiles_filter(chunk_type, bargs)) { return 0; } /* usage filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(fs_info, chunk_offset, bargs)) { return 0; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { return 0; } /* devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && chunk_devid_filter(leaf, chunk, bargs)) { return 0; } /* drange filter, makes sense only with devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && chunk_drange_filter(leaf, chunk, bargs)) { return 0; } /* vrange filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { return 0; } /* stripes filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && chunk_stripes_range_filter(leaf, chunk, bargs)) { return 0; } /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { return 0; } /* * limited by count, must be the last filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { if (bargs->limit == 0) return 0; else bargs->limit--; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { /* * Same logic as the 'limit' filter; the minimum cannot be * determined here because we do not have the global information * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) return 0; else bargs->limit_max--; } return 1; } static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; int slot; int ret; int enospc_errors = 0; bool counting = true; /* The single value limit and min/max limits use the same bytes in the */ u64 limit_data = bctl->data.limit; u64 limit_meta = bctl->meta.limit; u64 limit_sys = bctl->sys.limit; u32 count_data = 0; u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto error; } /* zero out stat counters */ spin_lock(&fs_info->balance_lock); memset(&bctl->stat, 0, sizeof(bctl->stat)); spin_unlock(&fs_info->balance_lock); again: if (!counting) { /* * The single value limit and min/max limits use the same bytes * in the */ bctl->data.limit = limit_data; bctl->meta.limit = limit_meta; bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || atomic_read(&fs_info->balance_cancel_req)) { ret = -ECANCELED; goto error; } mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } /* * this shouldn't happen, it means the last relocate * failed */ if (ret == 0) BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { mutex_unlock(&fs_info->reclaim_bgs_lock); ret = 0; break; } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) { mutex_unlock(&fs_info->reclaim_bgs_lock); break; } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); chunk_type = btrfs_chunk_type(leaf, chunk); if (!counting) { spin_lock(&fs_info->balance_lock); bctl->stat.considered++; spin_unlock(&fs_info->balance_lock); } ret = should_balance_chunk(leaf, chunk, found_key.offset); btrfs_release_path(path); if (!ret) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } if (counting) { mutex_unlock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); if (chunk_type & BTRFS_BLOCK_GROUP_DATA) count_data++; else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) count_sys++; else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) count_meta++; goto loop; } /* * Apply limit_min filter, no need to check if the LIMITS * filter is used, limit_min is 0 by default */ if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && count_data < bctl->data.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && count_meta < bctl->meta.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && count_sys < bctl->sys.limit_min)) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } if (!chunk_reserved) { /* * We may be relocating the only data chunk we have, * which could potentially end up with losing data's * raid profile, so lets allocate an empty one in * advance. */ ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } else if (ret == 1) { chunk_reserved = 1; } } ret = btrfs_relocate_chunk(fs_info, found_key.offset); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { enospc_errors++; } else if (ret == -ETXTBSY) { btrfs_info(fs_info, "skipping relocation of block group %llu due to active swapfile", found_key.offset); ret = 0; } else if (ret) { goto error; } else { spin_lock(&fs_info->balance_lock); bctl->stat.completed++; spin_unlock(&fs_info->balance_lock); } loop: if (found_key.offset == 0) break; key.offset = found_key.offset - 1; } if (counting) { btrfs_release_path(path); counting = false; goto again; } error: btrfs_free_path(path); if (enospc_errors) { btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); if (!ret) ret = -ENOSPC; } return ret; } /* * See if a given profile is valid and reduced. * * @flags: profile to validate * @extended: if true @flags is treated as an extended profile */ static int alloc_profile_is_valid(u64 flags, int extended) { u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : BTRFS_BLOCK_GROUP_PROFILE_MASK); flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; /* 1) check that all other bits are zeroed */ if (flags & ~mask) return 0; /* 2) see if profile is reduced */ if (flags == 0) return !extended; /* "0" is valid for usual profiles */ return has_single_bit_set(flags); } /* * Validate target profile against allowed profiles and return true if it's OK. * Otherwise print the error message and return false. */ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, const struct btrfs_balance_args *bargs, u64 allowed, const char *type) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) return true; btrfs_err(fs_info, "balance: invalid convert %s profile %s", type, btrfs_bg_type_to_raid_name(bargs->target)); return false; } /* * Fill @buf with textual description of balance filter flags @bargs, up to * @size_buf including the terminating null. The output may be trimmed if it * does not fit into the provided buffer. */ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, u32 size_buf) { int ret; u32 size_bp = size_buf; char *bp = buf; u64 flags = bargs->flags; char tmp_buf[128] = {'\0'}; if (!flags) return; #define CHECK_APPEND_NOARG(a) \ do { \ ret = snprintf(bp, size_bp, (a)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) #define CHECK_APPEND_1ARG(a, v1) \ do { \ ret = snprintf(bp, size_bp, (a), (v1)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) #define CHECK_APPEND_2ARG(a, v1, v2) \ do { \ ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) if (flags & BTRFS_BALANCE_ARGS_CONVERT) CHECK_APPEND_1ARG("convert=%s,", btrfs_bg_type_to_raid_name(bargs->target)); if (flags & BTRFS_BALANCE_ARGS_SOFT) CHECK_APPEND_NOARG("soft,"); if (flags & BTRFS_BALANCE_ARGS_PROFILES) { btrfs_describe_block_groups(bargs->profiles, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); } if (flags & BTRFS_BALANCE_ARGS_USAGE) CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) CHECK_APPEND_2ARG("usage=%u..%u,", bargs->usage_min, bargs->usage_max); if (flags & BTRFS_BALANCE_ARGS_DEVID) CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); if (flags & BTRFS_BALANCE_ARGS_DRANGE) CHECK_APPEND_2ARG("drange=%llu..%llu,", bargs->pstart, bargs->pend); if (flags & BTRFS_BALANCE_ARGS_VRANGE) CHECK_APPEND_2ARG("vrange=%llu..%llu,", bargs->vstart, bargs->vend); if (flags & BTRFS_BALANCE_ARGS_LIMIT) CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) CHECK_APPEND_2ARG("limit=%u..%u,", bargs->limit_min, bargs->limit_max); if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) CHECK_APPEND_2ARG("stripes=%u..%u,", bargs->stripes_min, bargs->stripes_max); #undef CHECK_APPEND_2ARG #undef CHECK_APPEND_1ARG #undef CHECK_APPEND_NOARG out_overflow: if (size_bp < size_buf) buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ else buf[0] = '\0'; } static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) { u32 size_buf = 1024; char tmp_buf[192] = {'\0'}; char *buf; char *bp; u32 size_bp = size_buf; int ret; struct btrfs_balance_control *bctl = fs_info->balance_ctl; buf = kzalloc(size_buf, GFP_KERNEL); if (!buf) return; bp = buf; #define CHECK_APPEND_1ARG(a, v1) \ do { \ ret = snprintf(bp, size_bp, (a), (v1)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ } while (0) if (bctl->flags & BTRFS_BALANCE_FORCE) CHECK_APPEND_1ARG("%s", "-f "); if (bctl->flags & BTRFS_BALANCE_DATA) { describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("-d%s ", tmp_buf); } if (bctl->flags & BTRFS_BALANCE_METADATA) { describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("-m%s ", tmp_buf); } if (bctl->flags & BTRFS_BALANCE_SYSTEM) { describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); CHECK_APPEND_1ARG("-s%s ", tmp_buf); } #undef CHECK_APPEND_1ARG out_overflow: if (size_bp < size_buf) buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ btrfs_info(fs_info, "balance: %s %s", (bctl->flags & BTRFS_BALANCE_RESUME) ? "resume" : "start", buf); kfree(buf); } /* * Should be called with balance mutexe held */ int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { u64 meta_target, data_target; u64 allowed; int mixed = 0; int ret; u64 num_devices; unsigned seq; bool reducing_redundancy; bool paused = false; int i; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || btrfs_should_cancel_balance(fs_info)) { ret = -EINVAL; goto out; } allowed = btrfs_super_incompat_flags(fs_info->super_copy); if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) mixed = 1; /* * In case of mixed groups both data and meta should be picked, * and identical options should be given for both of them. */ allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; if (mixed && (bctl->flags & allowed)) { if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { btrfs_err(fs_info, "balance: mixed groups data and metadata options must be the same"); ret = -EINVAL; goto out; } } /* * rw_devices will not change at the moment, device add/delete/replace * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; /* * SINGLE profile on-disk has no profile bit, but in-memory we have a * special bit for it, to make it easier to distinguish. Thus we need * to set it manually, or balance would refuse the profile. */ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { ret = -EINVAL; goto out; } /* * Allow to reduce metadata or system integrity only if force set for * profiles with redundancy (copies, parity) */ allowed = 0; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { if (btrfs_raid_array[i].ncopies >= 2 || btrfs_raid_array[i].tolerated_failures >= 1) allowed |= btrfs_raid_array[i].bg_flag; } do { seq = read_seqbegin(&fs_info->profiles_lock); if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_system_alloc_bits & allowed) && !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) reducing_redundancy = true; else reducing_redundancy = false; /* if we're not converting, the target field is uninitialized */ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? bctl->meta.target : fs_info->avail_metadata_alloc_bits; data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); if (reducing_redundancy) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, "balance: force reducing metadata redundancy"); } else { btrfs_err(fs_info, "balance: reduces metadata redundancy, use --force if you want this"); ret = -EINVAL; goto out; } } if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { btrfs_warn(fs_info, "balance: metadata profile %s has lower redundancy than data profile %s", btrfs_bg_type_to_raid_name(meta_target), btrfs_bg_type_to_raid_name(data_target)); } ret = insert_balance_item(fs_info, bctl); if (ret && ret != -EEXIST) goto out; if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { BUG_ON(ret == -EEXIST); BUG_ON(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = bctl; spin_unlock(&fs_info->balance_lock); } else { BUG_ON(ret != -EEXIST); spin_lock(&fs_info->balance_lock); update_balance_args(bctl); spin_unlock(&fs_info->balance_lock); } ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); describe_balance_start_or_resume(fs_info); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); paused = true; } /* * Balance can be canceled by: * * - Regular cancel request * Then ret == -ECANCELED and balance_cancel_req > 0 * * - Fatal signal to "btrfs" process * Either the signal caught by wait_reserve_ticket() and callers * got -EINTR, or caught by btrfs_should_cancel_balance() and * got -ECANCELED. * Either way, in this case balance_cancel_req = 0, and * ret == -EINTR or ret == -ECANCELED. * * So here we only check the return value to catch canceled balance. */ else if (ret == -ECANCELED || ret == -EINTR) btrfs_info(fs_info, "balance: canceled"); else btrfs_info(fs_info, "balance: ended with status: %d", ret); clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { memset(bargs, 0, sizeof(*bargs)); btrfs_update_ioctl_balance_args(fs_info, bargs); } /* We didn't pause, we can clean everything up. */ if (!paused) { reset_balance_state(fs_info); btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); return ret; out: if (bctl->flags & BTRFS_BALANCE_RESUME) reset_balance_state(fs_info); else kfree(bctl); btrfs_exclop_finish(fs_info); return ret; } static int balance_kthread(void *data) { struct btrfs_fs_info *fs_info = data; int ret = 0; sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); sb_end_write(fs_info->sb); return ret; } int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) { struct task_struct *tsk; mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return 0; } mutex_unlock(&fs_info->balance_mutex); if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { btrfs_info(fs_info, "balance: resume skipped"); return 0; } spin_lock(&fs_info->super_lock); ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; spin_unlock(&fs_info->super_lock); /* * A ro->rw remount sequence should continue with the paused balance * regardless of who pauses it, system or the user as of now, so set * the resume flag. */ spin_lock(&fs_info->balance_lock); fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; spin_unlock(&fs_info->balance_lock); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); return PTR_ERR_OR_ZERO(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl; struct btrfs_balance_item *item; struct btrfs_disk_balance_args disk_bargs; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = BTRFS_BALANCE_OBJECTID; key.type = BTRFS_TEMPORARY_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { /* ret = -ENOENT; */ ret = 0; goto out; } bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { ret = -ENOMEM; goto out; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); bctl->flags = btrfs_balance_flags(leaf, item); bctl->flags |= BTRFS_BALANCE_RESUME; btrfs_balance_data(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); btrfs_balance_meta(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); /* * This should never happen, as the paused balance state is recovered * during mount without any chance of other exclusive ops to collide. * * This gives the exclusive op status to balance and keeps in paused * state until user intervention (cancel or umount). If the ownership * cannot be assigned, show a message but do not fail. The balance * is in a paused state and must have fs_info::balance_ctl properly * set up. */ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); btrfs_release_path(path); mutex_lock(&fs_info->balance_mutex); BUG_ON(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = bctl; spin_unlock(&fs_info->balance_lock); mutex_unlock(&fs_info->balance_mutex); out: btrfs_free_path(path); return ret; } int btrfs_pause_balance(struct btrfs_fs_info *fs_info) { int ret = 0; mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return -ENOTCONN; } if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { atomic_inc(&fs_info->balance_pause_req); mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); /* we are good with balance_ctl ripped off from under us */ BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_pause_req); } else { ret = -ENOTCONN; } mutex_unlock(&fs_info->balance_mutex); return ret; } int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return -ENOTCONN; } /* * A paused balance with the item stored on disk can be resumed at * mount time if the mount is read-write. Otherwise it's still paused * and we must not allow cancelling as it deletes the item. */ if (sb_rdonly(fs_info->sb)) { mutex_unlock(&fs_info->balance_mutex); return -EROFS; } atomic_inc(&fs_info->balance_cancel_req); /* * if we are running just wait and return, balance item is * deleted in btrfs_balance in this case */ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); } else { mutex_unlock(&fs_info->balance_mutex); /* * Lock released to allow other waiters to continue, we'll * reexamine the status again. */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { reset_balance_state(fs_info); btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; } /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. * The chunk relocation code actually frees the device extent */ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; u64 chunk_offset; int ret; int slot; int failed = 0; bool retried = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; u64 start; u64 free_diff = 0; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; diff = round_down(old_size - new_size, fs_info->sectorsize); if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return -EINVAL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_BACK; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; /* * The new free_chunk_space is new_size - used, so we have to * subtract the delta of the old free_chunk_space which included * old_size - used. If used > new_size then just subtract this * entire device's free space. */ if (device->bytes_used < new_size) free_diff = (old_size - device->bytes_used) - (new_size - device->bytes_used); else free_diff = old_size - device->bytes_used; atomic64_sub(free_diff, &fs_info->free_chunk_space); } /* * Once the device's size has been set to the new size, ensure all * in-memory chunks are synced to disk so that the loop below sees them * and relocates them accordingly. */ if (contains_pending_extent(device, &start, diff)) { mutex_unlock(&fs_info->chunk_mutex); ret = btrfs_commit_transaction(trans); if (ret) goto done; } else { mutex_unlock(&fs_info->chunk_mutex); btrfs_end_transaction(trans); } again: key.objectid = device->devid; key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; do { mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_previous_item(root, path, 0, key.type); if (ret) { mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto done; ret = 0; btrfs_release_path(path); break; } l = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); /* * We may be relocating the only data chunk we have, * which could potentially end up with losing data's * raid profile, so lets allocate an empty one in * advance. */ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { failed++; } else if (ret) { if (ret == -ETXTBSY) { btrfs_warn(fs_info, "could not shrink block group %llu due to active swapfile", chunk_offset); } goto done; } } while (key.offset-- > 0); if (failed && !retried) { failed = 0; retried = true; goto again; } else if (failed && retried) { ret = -ENOSPC; goto done; } /* Shrinking succeeded, else we would be at "done". */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto done; } mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ clear_extent_bits(&device->alloc_state, new_size, (u64)-1, CHUNK_STATE_MASK); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) list_add_tail(&device->post_commit_list, &trans->transaction->dev_update_list); WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); btrfs_reserve_chunk_metadata(trans, false); /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } else { ret = btrfs_commit_transaction(trans); } done: btrfs_free_path(path); if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes += diff; atomic64_add(free_diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); } return ret; } static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; lockdep_assert_held(&fs_info->chunk_mutex); array_size = btrfs_super_sys_array_size(super_copy); if (array_size + item_size + sizeof(disk_key) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; ptr = super_copy->sys_chunk_array + array_size; btrfs_cpu_key_to_disk(&disk_key, key); memcpy(ptr, &disk_key, sizeof(disk_key)); ptr += sizeof(disk_key); memcpy(ptr, chunk, item_size); item_size += sizeof(disk_key); btrfs_set_super_sys_array_size(super_copy, array_size + item_size); return 0; } /* * sort the devices in descending order by max_avail, total_avail */ static int btrfs_cmp_device_info(const void *a, const void *b) { const struct btrfs_device_info *di_a = a; const struct btrfs_device_info *di_b = b; if (di_a->max_avail > di_b->max_avail) return -1; if (di_a->max_avail < di_b->max_avail) return 1; if (di_a->total_avail > di_b->total_avail) return -1; if (di_a->total_avail < di_b->total_avail) return 1; return 0; } static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) return; btrfs_set_fs_incompat(info, RAID56); } static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) { if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) return; btrfs_set_fs_incompat(info, RAID1C34); } /* * Structure used internally for btrfs_create_chunk() function. * Wraps needed parameters. */ struct alloc_chunk_ctl { u64 start; u64 type; /* Total number of stripes to allocate */ int num_stripes; /* sub_stripes info for map */ int sub_stripes; /* Stripes per device */ int dev_stripes; /* Maximum number of devices to use */ int devs_max; /* Minimum number of devices to use */ int devs_min; /* ndevs has to be a multiple of this */ int devs_increment; /* Number of copies */ int ncopies; /* Number of stripes worth of bytes to store parity information */ int nparity; u64 max_stripe_size; u64 max_chunk_size; u64 dev_extent_min; u64 stripe_size; u64 chunk_size; int ndevs; }; static void init_alloc_chunk_ctl_policy_regular( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { struct btrfs_space_info *space_info; space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); ASSERT(space_info); ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); } static void init_alloc_chunk_ctl_policy_zoned( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { u64 zone_size = fs_devices->fs_info->zone_size; u64 limit; int min_num_stripes = ctl->devs_min * ctl->dev_stripes; int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; u64 min_chunk_size = min_data_stripes * zone_size; u64 type = ctl->type; ctl->max_stripe_size = zone_size; if (type & BTRFS_BLOCK_GROUP_DATA) { ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, zone_size); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { ctl->max_chunk_size = ctl->max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { ctl->max_chunk_size = 2 * ctl->max_stripe_size; ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); } else { BUG(); } /* We don't want a chunk larger than 10% of writable space */ limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), zone_size), min_chunk_size); ctl->max_chunk_size = min(limit, ctl->max_chunk_size); ctl->dev_extent_min = zone_size * ctl->dev_stripes; } static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { int index = btrfs_bg_flags_to_raid_index(ctl->type); ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; ctl->devs_max = btrfs_raid_array[index].devs_max; if (!ctl->devs_max) ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); ctl->devs_min = btrfs_raid_array[index].devs_min; ctl->devs_increment = btrfs_raid_array[index].devs_increment; ctl->ncopies = btrfs_raid_array[index].ncopies; ctl->nparity = btrfs_raid_array[index].nparity; ctl->ndevs = 0; switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; case BTRFS_CHUNK_ALLOC_ZONED: init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); break; default: BUG(); } } static int gather_device_info(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = fs_devices->fs_info; struct btrfs_device *device; u64 total_avail; u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; int ret; int ndevs = 0; u64 max_avail; u64 dev_offset; /* * in the first pass through the devices list, we gather information * about the available holes on each device. */ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); continue; } if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (device->total_bytes > device->bytes_used) total_avail = device->total_bytes - device->bytes_used; else total_avail = 0; /* If there is no space on this device, skip it. */ if (total_avail < ctl->dev_extent_min) continue; ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) return ret; if (ret == 0) max_avail = dev_extent_want; if (max_avail < ctl->dev_extent_min) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, "%s: devid %llu has no free space, have=%llu want=%llu", __func__, device->devid, max_avail, ctl->dev_extent_min); continue; } if (ndevs == fs_devices->rw_devices) { WARN(1, "%s: found more than %llu devices\n", __func__, fs_devices->rw_devices); break; } devices_info[ndevs].dev_offset = dev_offset; devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; } ctl->ndevs = ndevs; /* * now sort the devices by hole size / available space */ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); return 0; } static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { /* Number of stripes that count for block group size */ int data_stripes; /* * The primary goal is to maximize the number of stripes, so use as * many devices as possible, even if the stripes are not maximum sized. * * The DUP profile stores more than one stripe per device, the * max_avail is the total size so we have to adjust. */ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; /* This will have to be fixed for RAID1 and RAID10 over more drives */ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; /* * Use the number of data stripes to figure out how big this chunk is * really going to be in terms of logical address space, and compare * that answer with the max chunk size. If it's higher, we try to * reduce stripe_size. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { /* * Reduce stripe_size, round it up to a 16MB boundary again and * then use it, unless it ends up being even bigger than the * previous value we had already. */ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, data_stripes), SZ_16M), ctl->stripe_size); } /* Stripe size should not go beyond 1G. */ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); /* Align to BTRFS_STRIPE_LEN */ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); ctl->chunk_size = ctl->stripe_size * data_stripes; return 0; } static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { u64 zone_size = devices_info[0].dev->zone_info->zone_size; /* Number of stripes that count for block group size */ int data_stripes; /* * It should hold because: * dev_extent_min == dev_extent_want == zone_size * dev_stripes */ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); ctl->stripe_size = zone_size; ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); } ctl->chunk_size = ctl->stripe_size * data_stripes; return 0; } static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = fs_devices->fs_info; /* * Round down to number of usable stripes, devs_increment can be any * number so we can't use round_down() that requires power of 2, while * rounddown is safe. */ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); if (ctl->ndevs < ctl->devs_min) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) { btrfs_debug(info, "%s: not enough devices with free space: have=%d minimum required=%d", __func__, ctl->ndevs, ctl->devs_min); } return -ENOSPC; } ctl->ndevs = min(ctl->ndevs, ctl->devs_max); switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); case BTRFS_CHUNK_ALLOC_ZONED: return decide_stripe_size_zoned(ctl, devices_info); default: BUG(); } } static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits) { for (int i = 0; i < map->num_stripes; i++) { struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; set_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + map->stripe_size - 1, bits | EXTENT_NOWAIT, NULL); } } static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) { for (int i = 0; i < map->num_stripes; i++) { struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + map->stripe_size - 1, bits | EXTENT_NOWAIT, NULL, NULL); } } void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { write_lock(&fs_info->mapping_tree_lock); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); write_unlock(&fs_info->mapping_tree_lock); /* Once for the tree reference. */ btrfs_free_chunk_map(map); } static int btrfs_chunk_map_cmp(const struct rb_node *new, const struct rb_node *exist) { const struct btrfs_chunk_map *new_map = rb_entry(new, struct btrfs_chunk_map, rb_node); const struct btrfs_chunk_map *exist_map = rb_entry(exist, struct btrfs_chunk_map, rb_node); if (new_map->start == exist_map->start) return 0; if (new_map->start < exist_map->start) return -1; return 1; } EXPORT_FOR_TESTS int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { struct rb_node *exist; write_lock(&fs_info->mapping_tree_lock); exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, btrfs_chunk_map_cmp); if (exist) { write_unlock(&fs_info->mapping_tree_lock); return -EEXIST; } chunk_map_device_set_bits(map, CHUNK_ALLOCATED); chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); return 0; } EXPORT_FOR_TESTS struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) { struct btrfs_chunk_map *map; map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp); if (!map) return NULL; refcount_set(&map->refs, 1); RB_CLEAR_NODE(&map->rb_node); return map; } static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_chunk_map *map; struct btrfs_block_group *block_group; u64 start = ctl->start; u64 type = ctl->type; int ret; map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); if (!map) return ERR_PTR(-ENOMEM); map->start = start; map->chunk_len = ctl->chunk_size; map->stripe_size = ctl->stripe_size; map->type = type; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->sub_stripes = ctl->sub_stripes; map->num_stripes = ctl->num_stripes; for (int i = 0; i < ctl->ndevs; i++) { for (int j = 0; j < ctl->dev_stripes; j++) { int s = i * ctl->dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + j * ctl->stripe_size; } } trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); ret = btrfs_add_chunk_map(info, map); if (ret) { btrfs_free_chunk_map(map); return ERR_PTR(ret); } block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); if (IS_ERR(block_group)) { btrfs_remove_chunk_map(info, map); return block_group; } for (int i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; btrfs_device_set_bytes_used(dev, dev->bytes_used + ctl->stripe_size); if (list_empty(&dev->post_commit_list)) list_add_tail(&dev->post_commit_list, &trans->transaction->dev_update_list); } atomic64_sub(ctl->stripe_size * map->num_stripes, &info->free_chunk_space); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); return block_group; } struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; struct btrfs_device_info *devices_info = NULL; struct alloc_chunk_ctl ctl; struct btrfs_block_group *block_group; int ret; lockdep_assert_held(&info->chunk_mutex); if (!alloc_profile_is_valid(type, 0)) { ASSERT(0); return ERR_PTR(-EINVAL); } if (list_empty(&fs_devices->alloc_list)) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, "%s: no writable device", __func__); return ERR_PTR(-ENOSPC); } if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(info, "invalid chunk type 0x%llx requested", type); ASSERT(0); return ERR_PTR(-EINVAL); } ctl.start = find_next_chunk(info); ctl.type = type; init_alloc_chunk_ctl(fs_devices, &ctl); devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return ERR_PTR(-ENOMEM); ret = gather_device_info(fs_devices, &ctl, devices_info); if (ret < 0) { block_group = ERR_PTR(ret); goto out; } ret = decide_stripe_size(fs_devices, &ctl, devices_info); if (ret < 0) { block_group = ERR_PTR(ret); goto out; } block_group = create_chunk(trans, &ctl, devices_info); out: kfree(devices_info); return block_group; } /* * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system * chunks. * * See the comment at btrfs_chunk_alloc() for details about the chunk allocation * phases. */ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_key key; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; struct btrfs_chunk_map *map; size_t item_size; int i; int ret; /* * We take the chunk_mutex for 2 reasons: * * 1) Updates and insertions in the chunk btree must be done while holding * the chunk_mutex, as well as updating the system chunk array in the * superblock. See the comment on top of btrfs_chunk_alloc() for the * details; * * 2) To prevent races with the final phase of a device replace operation * that replaces the device object associated with the map's stripes, * because the device object's id can change at any time during that * final phase of the device replace operation * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, * which would cause a failure when updating the device item, which does * not exists, or persisting a stripe of the chunk item with such ID. * Here we can't use the device_list_mutex because our caller already * has locked the chunk_mutex, and the final phase of device replace * acquires both mutexes - first the device_list_mutex and then the * chunk_mutex. Using any of those two mutexes protects us from a * concurrent device replace. */ lockdep_assert_held(&fs_info->chunk_mutex); map = btrfs_get_chunk_map(fs_info, bg->start, bg->length); if (IS_ERR(map)) { ret = PTR_ERR(map); btrfs_abort_transaction(trans, ret); return ret; } item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); if (!chunk) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_update_device(trans, device); if (ret) goto out; } stripe = &chunk->stripe; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 dev_offset = map->stripes[i].physical; btrfs_set_stack_stripe_devid(stripe, device->devid); btrfs_set_stack_stripe_offset(stripe, dev_offset); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; } btrfs_set_stack_chunk_length(chunk, bg->length); btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.type = BTRFS_CHUNK_ITEM_KEY; key.offset = bg->start; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); if (ret) goto out; set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); if (ret) goto out; } out: kfree(chunk); btrfs_free_chunk_map(map); return ret; } static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; struct btrfs_block_group *meta_bg; struct btrfs_block_group *sys_bg; /* * When adding a new device for sprouting, the seed device is read-only * so we must first allocate a metadata and a system chunk. But before * adding the block group items to the extent, device and chunk btrees, * we must first: * * 1) Create both chunks without doing any changes to the btrees, as * otherwise we would get -ENOSPC since the block groups from the * seed device are read-only; * * 2) Add the device item for the new sprout device - finishing the setup * of a new block group requires updating the device item in the chunk * btree, so it must exist when we attempt to do it. The previous step * ensures this does not fail with -ENOSPC. * * After that we can add the block group items to their btrees: * update existing device item in the chunk btree, add a new block group * item to the extent btree, add a new chunk item to the chunk btree and * finally add the new device extent items to the devices btree. */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); meta_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); sys_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); return 0; } static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map) { const int index = btrfs_bg_flags_to_raid_index(map->type); return btrfs_raid_array[index].tolerated_failures; } bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_chunk_map *map; int miss_ndevs = 0; int i; bool ret = true; map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(map)) return false; for (i = 0; i < map->num_stripes; i++) { if (test_bit(BTRFS_DEV_STATE_MISSING, &map->stripes[i].dev->dev_state)) { miss_ndevs++; continue; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &map->stripes[i].dev->dev_state)) { ret = false; goto end; } } /* * If the number of missing devices is larger than max errors, we can * not write the data into that chunk successfully. */ if (miss_ndevs > btrfs_chunk_max_errors(map)) ret = false; end: btrfs_free_chunk_map(map); return ret; } void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) { write_lock(&fs_info->mapping_tree_lock); while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) { struct btrfs_chunk_map *map; struct rb_node *node; node = rb_first_cached(&fs_info->mapping_tree); map = rb_entry(node, struct btrfs_chunk_map, rb_node); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); /* Once for the tree ref. */ btrfs_free_chunk_map(map); cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } write_unlock(&fs_info->mapping_tree_lock); } static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) { enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); if (map->type & BTRFS_BLOCK_GROUP_RAID5) return 2; /* * There could be two corrupted data stripes, we need to loop retry in * order to rebuild the correct data. * * Fail a stripe at a time on every retry except the stripe under * reconstruction. */ if (map->type & BTRFS_BLOCK_GROUP_RAID6) return map->num_stripes; /* Non-RAID56, use their ncopies from btrfs_raid_array. */ return btrfs_raid_array[index].ncopies; } int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct btrfs_chunk_map *map; int ret; map = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(map)) /* * We could return errors for these cases, but that could get * ugly and we'd probably do the same thing which is just not do * anything else and exit, so return 1 so the callers don't try * to use other copies. */ return 1; ret = btrfs_chunk_map_num_copies(map); btrfs_free_chunk_map(map); return ret; } unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_chunk_map *map; unsigned long len = fs_info->sectorsize; if (!btrfs_fs_incompat(fs_info, RAID56)) return len; map = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); btrfs_free_chunk_map(map); } return len; } #ifdef CONFIG_BTRFS_EXPERIMENTAL static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) { for (int index = first; index < first + num_stripes; index++) { const struct btrfs_device *device = map->stripes[index].dev; if (device->devid == READ_ONCE(device->fs_devices->read_devid)) return index; } /* If no read-preferred device is set use the first stripe. */ return first; } struct stripe_mirror { u64 devid; int num; }; static int btrfs_cmp_devid(const void *a, const void *b) { const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; if (s1->devid < s2->devid) return -1; if (s1->devid > s2->devid) return 1; return 0; } /* * Select a stripe for reading using the round-robin algorithm. * * 1. Compute the read cycle as the total sectors read divided by the minimum * sectors per device. * 2. Determine the stripe number for the current read by taking the modulus * of the read cycle with the total number of stripes: * * stripe index = (total sectors / min sectors per dev) % num stripes * * The calculated stripe index is then used to select the corresponding device * from the list of devices, which is ordered by devid. */ static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) { struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; struct btrfs_device *device = map->stripes[first].dev; struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; unsigned int read_cycle; unsigned int total_reads; unsigned int min_reads_per_dev; total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> fs_info->sectorsize_bits; for (int index = 0, i = first; i < first + num_stripes; i++) { stripes[index].devid = map->stripes[i].dev->devid; stripes[index].num = i; index++; } sort(stripes, num_stripes, sizeof(struct stripe_mirror), btrfs_cmp_devid, NULL); read_cycle = total_reads / min_reads_per_dev; return stripes[read_cycle % num_stripes].num; } #endif static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; int tolerance; struct btrfs_device *srcdev; ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; else num_stripes = map->num_stripes; switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", policy); WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; #ifdef CONFIG_BTRFS_EXPERIMENTAL case BTRFS_READ_POLICY_RR: preferred_mirror = btrfs_read_rr(map, first, num_stripes); break; case BTRFS_READ_POLICY_DEVID: preferred_mirror = btrfs_read_preferred(map, first, num_stripes); break; #endif } if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) srcdev = fs_info->dev_replace.srcdev; else srcdev = NULL; /* * try to avoid the drive that is the source drive for a * dev-replace procedure, only choose it if no other non-missing * mirror is available */ for (tolerance = 0; tolerance < 2; tolerance++) { if (map->stripes[preferred_mirror].dev->bdev && (tolerance || map->stripes[preferred_mirror].dev != srcdev)) return preferred_mirror; for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && (tolerance || map->stripes[i].dev != srcdev)) return i; } } /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ return preferred_mirror; } EXPORT_FOR_TESTS struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; bioc = kzalloc( /* The size of btrfs_io_context */ sizeof(struct btrfs_io_context) + /* Plus the variable array for the stripes */ sizeof(struct btrfs_io_stripe) * (total_stripes), GFP_NOFS); if (!bioc) return NULL; refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; bioc->replace_stripe_src = -1; bioc->full_stripe_logical = (u64)-1; bioc->logical = logical; return bioc; } void btrfs_get_bioc(struct btrfs_io_context *bioc) { WARN_ON(!refcount_read(&bioc->refs)); refcount_inc(&bioc->refs); } void btrfs_put_bioc(struct btrfs_io_context *bioc) { if (!bioc) return; if (refcount_dec_and_test(&bioc->refs)) kfree(bioc); } /* * Please note that, discard won't be sent to target device of device * replace. */ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes) { struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; u32 stripe_nr; u32 stripe_nr_end; u32 stripe_cnt; u64 stripe_end_offset; u64 stripe_offset; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; u32 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; int ret; int i; map = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(map)) return ERR_CAST(map); /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out_free_map; } offset = logical - map->start; length = min_t(u64, map->start + map->chunk_len - logical, length); *length_ret = length; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - (offset + length); /* * after this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ *num_stripes = 1; stripe_index = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { if (map->type & BTRFS_BLOCK_GROUP_RAID0) sub_stripes = 1; else sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes; *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); stripe_index = stripe_nr % factor; stripe_nr /= factor; stripe_index *= sub_stripes; remaining_stripes = stripe_cnt % factor; stripes_per_dev = stripe_cnt / factor; last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { *num_stripes = map->num_stripes; } else { stripe_index = stripe_nr % map->num_stripes; stripe_nr /= map->num_stripes; } stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); if (!stripes) { ret = -ENOMEM; goto out_free_map; } for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); if (i / sub_stripes < remaining_stripes) stripes[i].length += BTRFS_STRIPE_LEN; /* * Special for the first stripe and * the last stripe: * * |-------|...|-------| * |----------| * off end_off */ if (i < sub_stripes) stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { stripes[i].length = length; } stripe_index++; if (stripe_index == map->num_stripes) { stripe_index = 0; stripe_nr++; } } btrfs_free_chunk_map(map); return stripes; out_free_map: btrfs_free_chunk_map(map); return ERR_PTR(ret); } static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; bool ret; /* Non zoned filesystem does not use "to_copy" flag */ if (!btrfs_is_zoned(fs_info)) return false; cache = btrfs_lookup_block_group(fs_info, logical); ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; } static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_dev_replace *dev_replace, u64 logical, struct btrfs_io_geometry *io_geom) { u64 srcdev_devid = dev_replace->srcdev->devid; /* * At this stage, num_stripes is still the real number of stripes, * excluding the duplicated stripes. */ int num_stripes = io_geom->num_stripes; int max_errors = io_geom->max_errors; int nr_extra_stripes = 0; int i; /* * A block group which has "to_copy" set will eventually be copied by * the dev-replace process. We can avoid cloning IO here. */ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) return; /* * Duplicate the write operations while the dev-replace procedure is * running. Since the copying of the old disk to the new disk takes * place at run time while the filesystem is mounted writable, the * regular write operations to the old disk have to be duplicated to go * to the new disk as well. * * Note that device->missing is handled by the caller, and that the * write to the old disk is already set up in the stripes array. */ for (i = 0; i < num_stripes; i++) { struct btrfs_io_stripe *old = &bioc->stripes[i]; struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; if (old->dev->devid != srcdev_devid) continue; new->physical = old->physical; new->dev = dev_replace->tgtdev; if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) bioc->replace_stripe_src = i; nr_extra_stripes++; } /* We can only have at most 2 extra nr_stripes (for DUP). */ ASSERT(nr_extra_stripes <= 2); /* * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for * replace. * If we have 2 extra stripes, only choose the one with smaller physical. */ if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; /* Only DUP can have two extra stripes. */ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); /* * Swap the last stripe stripes and reduce @nr_extra_stripes. * The extra stripe would still be there, but won't be accessed. */ if (first->physical > second->physical) { swap(second->physical, first->physical); swap(second->dev, first->dev); nr_extra_stripes--; } } io_geom->num_stripes = num_stripes + nr_extra_stripes; io_geom->max_errors = max_errors + nr_extra_stripes; bioc->replace_nr_stripes = nr_extra_stripes; } static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, struct btrfs_io_geometry *io_geom) { /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; ASSERT(io_geom->stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * For full stripe start, we use previously calculated * @stripe_nr. Align it to nr_data_stripes, then multiply with * STRIPE_LEN. * * By this we can avoid u64 division completely. And we have * to go rounddown(), not round_down(), as nr_data_stripes is * not ensured to be power of 2. */ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( rounddown(io_geom->stripe_nr, nr_data_stripes(map))); ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); ASSERT(io_geom->raid56_full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. */ if (io_geom->op == BTRFS_MAP_WRITE) return full_stripe_len - (offset - io_geom->raid56_full_stripe_start); } /* * For other RAID types and for RAID56 reads, allow a single stripe (on * a single disk). */ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) return BTRFS_STRIPE_LEN - io_geom->stripe_offset; return U64_MAX; } static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, u64 *length, struct btrfs_io_stripe *dst, struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { dst->dev = map->stripes[io_geom->stripe_index].dev; if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst) return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type, io_geom->stripe_index, dst); dst->physical = map->stripes[io_geom->stripe_index].physical + io_geom->stripe_offset + btrfs_stripe_nr_to_offset(io_geom->stripe_nr); return 0; } static bool is_single_device_io(struct btrfs_fs_info *fs_info, const struct btrfs_io_stripe *smap, const struct btrfs_chunk_map *map, int num_alloc_stripes, struct btrfs_io_geometry *io_geom) { if (!smap) return false; if (num_alloc_stripes != 1) return false; if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ) return false; if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1) return false; return true; } static void map_blocks_raid0(const struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; io_geom->stripe_nr /= map->num_stripes; if (io_geom->op == BTRFS_MAP_READ) io_geom->mirror_num = 1; } static void map_blocks_raid1(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom, bool dev_replace_is_ongoing) { if (io_geom->op != BTRFS_MAP_READ) { io_geom->num_stripes = map->num_stripes; return; } if (io_geom->mirror_num) { io_geom->stripe_index = io_geom->mirror_num - 1; return; } io_geom->stripe_index = find_live_mirror(fs_info, map, 0, dev_replace_is_ongoing); io_geom->mirror_num = io_geom->stripe_index + 1; } static void map_blocks_dup(const struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { if (io_geom->op != BTRFS_MAP_READ) { io_geom->num_stripes = map->num_stripes; return; } if (io_geom->mirror_num) { io_geom->stripe_index = io_geom->mirror_num - 1; return; } io_geom->mirror_num = 1; } static void map_blocks_raid10(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom, bool dev_replace_is_ongoing) { u32 factor = map->num_stripes / map->sub_stripes; int old_stripe_index; io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes; io_geom->stripe_nr /= factor; if (io_geom->op != BTRFS_MAP_READ) { io_geom->num_stripes = map->sub_stripes; return; } if (io_geom->mirror_num) { io_geom->stripe_index += io_geom->mirror_num - 1; return; } old_stripe_index = io_geom->stripe_index; io_geom->stripe_index = find_live_mirror(fs_info, map, io_geom->stripe_index, dev_replace_is_ongoing); io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; } static void map_blocks_raid56_write(struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom, u64 logical, u64 *length) { int data_stripes = nr_data_stripes(map); /* * Needs full stripe mapping. * * Push stripe_nr back to the start of the full stripe For those cases * needing a full stripe, @stripe_nr is the full stripe number. * * Originally we go raid56_full_stripe_start / full_stripe_len, but * that can be expensive. Here we just divide @stripe_nr with * @data_stripes. */ io_geom->stripe_nr /= data_stripes; /* RAID[56] write or recovery. Return all stripes */ io_geom->num_stripes = map->num_stripes; io_geom->max_errors = btrfs_chunk_max_errors(map); /* Return the length to the full stripe end. */ *length = min(logical + *length, io_geom->raid56_full_stripe_start + map->start + btrfs_stripe_nr_to_offset(data_stripes)) - logical; io_geom->stripe_index = 0; io_geom->stripe_offset = 0; } static void map_blocks_raid56_read(struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { int data_stripes = nr_data_stripes(map); ASSERT(io_geom->mirror_num <= 1); /* Just grab the data stripe directly. */ io_geom->stripe_index = io_geom->stripe_nr % data_stripes; io_geom->stripe_nr /= data_stripes; /* We distribute the parity blocks across stripes. */ io_geom->stripe_index = (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes; if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1) io_geom->mirror_num = 1; } static void map_blocks_single(const struct btrfs_chunk_map *map, struct btrfs_io_geometry *io_geom) { io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; io_geom->stripe_nr /= map->num_stripes; io_geom->mirror_num = io_geom->stripe_index + 1; } /* * Map one logical range to one or more physical ranges. * * @length: (Mandatory) mapped length of this run. * One logical range can be split into different segments * due to factors like zones and RAID0/5/6/10 stripe * boundaries. * * @bioc_ret: (Mandatory) returned btrfs_io_context structure. * which has one or more physical ranges (btrfs_io_stripe) * recorded inside. * Caller should call btrfs_put_bioc() to free it after use. * * @smap: (Optional) single physical range optimization. * If the map request can be fulfilled by one single * physical range, and this is parameter is not NULL, * then @bioc_ret would be NULL, and @smap would be * updated. * * @mirror_num_ret: (Mandatory) returned mirror number if the original * value is 0. * * Mirror number 0 means to choose any live mirrors. * * For non-RAID56 profiles, non-zero mirror_num means * the Nth mirror. (e.g. mirror_num 1 means the first * copy). * * For RAID56 profile, mirror 1 means rebuild from P and * the remaining data stripes. * * For RAID6 profile, mirror > 2 means mark another * data/P stripe error and rebuild from the remaining * stripes.. */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret) { struct btrfs_chunk_map *map; struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; int ret = 0; int num_copies; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; u16 num_alloc_stripes; u64 max_len; ASSERT(bioc_ret); io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); io_geom.num_stripes = 1; io_geom.stripe_index = 0; io_geom.op = op; map = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(map)) return PTR_ERR(map); num_copies = btrfs_chunk_map_num_copies(map); if (io_geom.mirror_num > num_copies) return -EINVAL; map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); if (dev_replace->replace_task != current) down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case BTRFS_BLOCK_GROUP_RAID0: map_blocks_raid0(map, &io_geom); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); break; case BTRFS_BLOCK_GROUP_DUP: map_blocks_dup(map, &io_geom); break; case BTRFS_BLOCK_GROUP_RAID10: map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) map_blocks_raid56_write(map, &io_geom, logical, length); else map_blocks_raid56_read(map, &io_geom); break; default: /* * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ map_blocks_single(map, &io_geom); break; } if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", io_geom.stripe_index, map->num_stripes); ret = -EINVAL; goto out; } num_alloc_stripes = io_geom.num_stripes; if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) /* * For replace case, we need to add extra stripes for extra * duplicated stripes. * * For both WRITE and GET_READ_MIRRORS, we may have at most * 2 more stripes (DUP types, otherwise 1). */ num_alloc_stripes += 2; /* * If this I/O maps to a single device, try to return the device and * physical block information on the stack instead of allocating an * I/O context structure. */ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) { ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; *bioc_ret = NULL; goto out; } bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; } bioc->map_type = map->type; bioc->use_rst = io_geom.use_rst; /* * For RAID56 full map, we need to make sure the stripes[] follows the * rule that data stripes are all ordered, then followed with P and Q * (if we have). * * It's still mostly the same as other profiles, just with extra rotation. */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo * with num_stripes). * * In this case, we just add @stripe_nr with @i, then do the * modulo, to reduce one modulo call. */ bioc->full_stripe_logical = map->start + btrfs_stripe_nr_to_offset(io_geom.stripe_nr * nr_data_stripes(map)); for (int i = 0; i < io_geom.num_stripes; i++) { struct btrfs_io_stripe *dst = &bioc->stripes[i]; u32 stripe_index; stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes; dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + io_geom.stripe_offset + btrfs_stripe_nr_to_offset(io_geom.stripe_nr); } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ for (int i = 0; i < io_geom.num_stripes; i++) { ret = set_io_stripe(fs_info, logical, length, &bioc->stripes[i], map, &io_geom); if (ret < 0) break; io_geom.stripe_index++; } } if (ret) { *bioc_ret = NULL; btrfs_put_bioc(bioc); goto out; } if (op != BTRFS_MAP_READ) io_geom.max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom); } *bioc_ret = bioc; bioc->num_stripes = io_geom.num_stripes; bioc->max_errors = io_geom.max_errors; bioc->mirror_num = io_geom.mirror_num; out: if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); } btrfs_free_chunk_map(map); return ret; } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { if (args->fsid == NULL) return true; if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) return true; return false; } static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, const struct btrfs_device *device) { if (args->missing) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && !device->bdev) return true; return false; } if (device->devid != args->devid) return false; if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) return false; return true; } /* * Find a device specified by @devid or @uuid in the list of @fs_devices, or * return NULL. * * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. */ struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, const struct btrfs_dev_lookup_args *args) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; if (dev_args_match_fs_devices(args, fs_devices)) { list_for_each_entry(device, &fs_devices->devices, dev_list) { if (dev_args_match_device(args, device)) return device; } } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!dev_args_match_fs_devices(args, seed_devs)) continue; list_for_each_entry(device, &seed_devs->devices, dev_list) { if (dev_args_match_device(args, device)) return device; } } return NULL; } static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *dev_uuid) { struct btrfs_device *device; unsigned int nofs_flag; /* * We call this under the chunk_mutex, so we want to use NOFS for this * allocation, however we don't want to change btrfs_alloc_device() to * always do NOFS because we use it in a lot of other GFP_KERNEL safe * places. */ nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices++; return device; } /* * Allocate new device struct, set up devid and UUID. * * @fs_info: used only for generating a new devid, can be NULL if * devid is provided (i.e. @devid != NULL). * @devid: a pointer to devid for this device. If NULL a new devid * is generated. * @uuid: a pointer to UUID for this device. If NULL a new UUID * is generated. * @path: a pointer to device path if available, NULL otherwise. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be * destroyed with btrfs_free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid, const char *path) { struct btrfs_device *dev; u64 tmp; if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; else { int ret; ret = find_next_devid(fs_info, &tmp); if (ret) { btrfs_free_device(dev); return ERR_PTR(ret); } } dev->devid = tmp; if (uuid) memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); else generate_random_uuid(dev->uuid); if (path) { struct rcu_string *name; name = rcu_string_strdup(path, GFP_KERNEL); if (!name) { btrfs_free_device(dev); return ERR_PTR(-ENOMEM); } rcu_assign_pointer(dev->name, name); } return dev; } static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, bool error) { if (error) btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); else btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); } u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map) { const int data_stripes = calc_data_stripes(map->type, map->num_stripes); return div_u64(map->chunk_len, data_stripes); } #if BITS_PER_LONG == 32 /* * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE * can't be accessed on 32bit systems. * * This function do mount time check to reject the fs if it already has * metadata chunk beyond that limit. */ static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, u64 logical, u64 length, u64 type) { if (!(type & BTRFS_BLOCK_GROUP_METADATA)) return 0; if (logical + length < MAX_LFS_FILESIZE) return 0; btrfs_err_32bit_limit(fs_info); return -EOVERFLOW; } /* * This is to give early warning for any metadata chunk reaching * BTRFS_32BIT_EARLY_WARN_THRESHOLD. * Although we can still access the metadata, it's not going to be possible * once the limit is reached. */ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, u64 logical, u64 length, u64 type) { if (!(type & BTRFS_BLOCK_GROUP_METADATA)) return; if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) return; btrfs_warn_32bit_limit(fs_info); } #endif static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid) { struct btrfs_device *dev; if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, uuid, true); return ERR_PTR(-ENOENT); } dev = add_missing_dev(fs_info->fs_devices, devid, uuid); if (IS_ERR(dev)) { btrfs_err(fs_info, "failed to init missing device %llu: %ld", devid, PTR_ERR(dev)); return dev; } btrfs_report_missing_device(fs_info, devid, uuid, false); return dev; } static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_chunk_map *map; u64 logical; u64 length; u64 devid; u64 type; u8 uuid[BTRFS_UUID_SIZE]; int index; int num_stripes; int ret; int i; logical = key->offset; length = btrfs_chunk_length(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); index = btrfs_bg_flags_to_raid_index(type); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); #if BITS_PER_LONG == 32 ret = check_32bit_meta_chunk(fs_info, logical, length, type); if (ret < 0) return ret; warn_32bit_meta_chunk(fs_info, logical, length, type); #endif map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ if (map && map->start <= logical && map->start + map->chunk_len > logical) { btrfs_free_chunk_map(map); return 0; } else if (map) { btrfs_free_chunk_map(map); } map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS); if (!map) return -ENOMEM; map->start = logical; map->chunk_len = length; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); map->type = type; /* * We can't use the sub_stripes value, as for profiles other than * RAID10, they may have 0 as sub_stripes for filesystems created by * older mkfs (<v5.4). * In that case, it can cause divide-by-zero errors later. * Since currently sub_stripes is fixed for each profile, let's * use the trusted value instead. */ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; map->stripe_size = btrfs_calc_stripe_length(map); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); devid = btrfs_stripe_devid_nr(leaf, chunk, i); args.devid = devid; read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); args.uuid = uuid; map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); if (!map->stripes[i].dev) { map->stripes[i].dev = handle_missing_device(fs_info, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { ret = PTR_ERR(map->stripes[i].dev); btrfs_free_chunk_map(map); return ret; } } set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &(map->stripes[i].dev->dev_state)); } ret = btrfs_add_chunk_map(fs_info, map); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); btrfs_free_chunk_map(map); } return ret; } static void fill_device_from_item(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item, struct btrfs_device *device) { unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); device->total_bytes = device->disk_total_bytes; device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->commit_bytes_used = device->bytes_used; device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); } static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, u8 *fsid) { struct btrfs_fs_devices *fs_devices; int ret; lockdep_assert_held(&uuid_mutex); ASSERT(fsid); /* This will match only for multi-device seed fs */ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, "failed to find fsid %pU when attempting to open seed devices", fsid); return ERR_PTR(-ENOENT); } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) return fs_devices; fs_devices->seeding = true; fs_devices->opened = 1; return fs_devices; } /* * Upon first call for a seed fs fsid, just create a private copy of the * respective fs_devices and anchor it at fs_info->fs_devices->seed_list */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); return ERR_PTR(-EINVAL); } list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); return fs_devices; } static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 devid; int ret; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; devid = btrfs_device_id(leaf, dev_item); args.devid = devid; read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); args.uuid = dev_uuid; args.fsid = fs_uuid; if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); } device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, dev_uuid, true); return -ENOENT; } device = add_missing_dev(fs_devices, devid, dev_uuid); if (IS_ERR(device)) { btrfs_err(fs_info, "failed to add missing dev %llu: %ld", devid, PTR_ERR(device)); return PTR_ERR(device); } btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } else { if (!device->bdev) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, dev_uuid, true); return -ENOENT; } btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } if (!device->bdev && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { /* * this happens when a device that was properly setup * in the device info lists suddenly goes bad. * device->bdev is NULL, and so we have to set * device->missing to one here */ device->fs_devices->missing_devices++; set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } /* Move the device to its own fs_devices */ if (device->fs_devices != fs_devices) { ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)); list_move(&device->dev_list, &fs_devices->devices); device->fs_devices->num_devices--; fs_devices->num_devices++; device->fs_devices->missing_devices--; fs_devices->missing_devices++; device->fs_devices = fs_devices; } } if (device->fs_devices != fs_info->fs_devices) { BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); if (device->generation != btrfs_device_generation(leaf, dev_item)) return -EINVAL; } fill_device_from_item(leaf, dev_item, device); if (device->bdev) { u64 max_total_bytes = bdev_nr_bytes(device->bdev); if (device->total_bytes > max_total_bytes) { btrfs_err(fs_info, "device total_bytes should be at most %llu but found %llu", max_total_bytes, device->total_bytes); return -EINVAL; } } set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { device->fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes - device->bytes_used, &fs_info->free_chunk_space); } ret = 0; return ret; } int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; u8 *array_ptr; unsigned long sb_array_offset; int ret = 0; u32 array_size; u32 cur_offset; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); /* * We allocated a dummy extent, just to use extent buffer accessors. * There will be unused space after BTRFS_SUPER_INFO_SIZE, but * that's fine, we will not go beyond system chunk array anyway. */ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; set_extent_buffer_uptodate(sb); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); array_ptr = super_copy->sys_chunk_array; sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); cur_offset = 0; while (cur_offset < array_size) { struct btrfs_chunk *chunk; struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; u32 len = sizeof(*disk_key); /* * The sys_chunk_array has been already verified at super block * read time. Only do ASSERT()s for basic checks. */ ASSERT(cur_offset + len <= array_size); btrfs_disk_key_to_cpu(&key, disk_key); array_ptr += len; sb_array_offset += len; cur_offset += len; ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY); chunk = (struct btrfs_chunk *)sb_array_offset; ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); ASSERT(cur_offset + len <= array_size); ret = read_one_chunk(&key, sb, chunk); if (ret) break; array_ptr += len; sb_array_offset += len; cur_offset += len; } clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; } /* * Check if all chunks in the fs are OK for read-write degraded mount * * If the @failing_dev is specified, it's accounted as missing. * * Return true if all chunks meet the minimal RW mount requirements. * Return false if any chunk doesn't meet the minimal RW mount requirements. */ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { struct btrfs_chunk_map *map; u64 next_start; bool ret = true; map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? Return false anyway */ if (!map) { ret = false; goto out; } while (map) { int missing = 0; int max_tolerated; int i; max_tolerated = btrfs_get_num_tolerated_disk_barrier_failures( map->type); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || dev->last_flush_error) missing++; else if (failing_dev && failing_dev == dev) missing++; } if (missing > max_tolerated) { if (!failing_dev) btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", map->start, missing, max_tolerated); btrfs_free_chunk_map(map); ret = false; goto out; } next_start = map->start + map->chunk_len; btrfs_free_chunk_map(map); map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } out: return ret; } static void readahead_tree_node_children(struct extent_buffer *node) { int i; const int nr_items = btrfs_header_nritems(node); for (i = 0; i < nr_items; i++) btrfs_readahead_node_child(node, i); } int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; struct btrfs_key found_key; int ret; int slot; int iter_ret = 0; u64 total_dev = 0; u64 last_ra_node = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * uuid_mutex is needed only if we are mounting a sprout FS * otherwise we don't need it. */ mutex_lock(&uuid_mutex); /* * It is possible for mount and umount to race in such a way that * we execute this code path, but open_fs_devices failed to clear * total_rw_bytes. We certainly want it cleared before reading the * device items, so clear it here. */ fs_info->fs_devices->total_rw_bytes = 0; /* * Lockdep complains about possible circular locking dependency between * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores * used for freeze procection of a fs (struct super_block.s_writers), * which we take when starting a transaction, and extent buffers of the * chunk tree if we call read_one_dev() while holding a lock on an * extent buffer of the chunk tree. Since we are mounting the filesystem * and at this point there can't be any concurrent task modifying the * chunk tree, to keep it simple, just skip locking on the chunk tree. */ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); path->skip_locking = 1; /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; leaf = path->nodes[0]; slot = path->slots[0]; if (node) { if (last_ra_node != node->start) { readahead_tree_node_children(node); last_ra_node = node->start; } } if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); ret = read_one_dev(leaf, dev_item); if (ret) goto error; total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; /* * We are only called at mount time, so no need to take * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, * we always lock first fs_info->chunk_mutex before * acquiring any locks on the chunk tree. This is a * requirement for chunk allocation, see the comment on * top of btrfs_chunk_alloc() for details. */ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = read_one_chunk(&found_key, leaf, chunk); if (ret) goto error; } } /* Catch error found during iteration */ if (iter_ret < 0) { ret = iter_ret; goto error; } /* * After loading chunk tree, we've got all device information, * do another round of validation checks. */ if (total_dev != fs_info->fs_devices->total_devices) { btrfs_warn(fs_info, "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); fs_info->fs_devices->total_devices = total_dev; btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { btrfs_err(fs_info, "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", btrfs_super_total_bytes(fs_info->super_copy), fs_info->fs_devices->total_rw_bytes); ret = -EINVAL; goto error; } ret = 0; error: mutex_unlock(&uuid_mutex); btrfs_free_path(path); return ret; } int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; int ret = 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed_devs->devices, dev_list) { device->fs_info = fs_info; ret = btrfs_get_dev_zone_info(device, false); if (ret) break; } seed_devs->fs_info = fs_info; } mutex_unlock(&fs_devices->device_list_mutex); return ret; } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, const struct btrfs_dev_stats_item *ptr, int index) { u64 val; read_extent_buffer(eb, &val, offsetof(struct btrfs_dev_stats_item, values) + ((unsigned long)ptr) + (index * sizeof(u64)), sizeof(val)); return val; } static void btrfs_set_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, int index, u64 val) { write_extent_buffer(eb, &val, offsetof(struct btrfs_dev_stats_item, values) + ((unsigned long)ptr) + (index * sizeof(u64)), sizeof(val)); } static int btrfs_device_init_dev_stats(struct btrfs_device *device, struct btrfs_path *path) { struct btrfs_dev_stats_item *ptr; struct extent_buffer *eb; struct btrfs_key key; int item_size; int i, ret, slot; if (!device->fs_info->dev_root) return 0; key.objectid = BTRFS_DEV_STATS_OBJECTID; key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); if (ret) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_dev_stat_set(device, i, 0); device->dev_stats_valid = 1; btrfs_release_path(path); return ret < 0 ? ret : 0; } slot = path->slots[0]; eb = path->nodes[0]; item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (item_size >= (1 + i) * sizeof(__le64)) btrfs_dev_stat_set(device, i, btrfs_dev_stats_value(eb, ptr, i)); else btrfs_dev_stat_set(device, i, 0); } device->dev_stats_valid = 1; btrfs_dev_stat_print_on_load(device); btrfs_release_path(path); return 0; } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_device_init_dev_stats(device, path); if (ret) goto out; } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed_devs->devices, dev_list) { ret = btrfs_device_init_dev_stats(device, path); if (ret) goto out; } } out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_stats_item *ptr; int ret; int i; key.objectid = BTRFS_DEV_STATS_OBJECTID; key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn_in_rcu(fs_info, "error %d while searching for dev_stats item for device %s", ret, btrfs_dev_name(device)); goto out; } if (ret == 0 && btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { btrfs_warn_in_rcu(fs_info, "delete too small dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; } ret = 1; } if (ret == 1) { /* need to insert a new item */ btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { btrfs_warn_in_rcu(fs_info, "insert dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; } } eb = path->nodes[0]; ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); out: btrfs_free_path(path); return ret; } /* * called from commit_transaction. Writes all changed device stats to disk. */ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; int stats_cnt; int ret = 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { stats_cnt = atomic_read(&device->dev_stats_ccnt); if (!device->dev_stats_valid || stats_cnt == 0) continue; /* * There is a LOAD-LOAD control dependency between the value of * dev_stats_ccnt and updating the on-disk values which requires * reading the in-memory counters. Such control dependencies * require explicit read memory barriers. * * This memory barriers pairs with smp_mb__before_atomic in * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full * barrier implied by atomic_xchg in * btrfs_dev_stats_read_and_reset */ smp_rmb(); ret = update_dev_stat_item(trans, device); if (!ret) atomic_sub(stats_cnt, &device->dev_stats_ccnt); } mutex_unlock(&fs_devices->device_list_mutex); return ret; } void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); if (!dev->dev_stats_valid) return; btrfs_err_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) { int i; for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) if (btrfs_dev_stat_read(dev, i) != 0) break; if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ btrfs_info_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *dev; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int i; mutex_lock(&fs_devices->device_list_mutex); args.devid = stats->devid; dev = btrfs_find_device(fs_info->fs_devices, &args); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { btrfs_warn(fs_info, "get dev_stats failed, device not found"); return -ENODEV; } else if (!dev->dev_stats_valid) { btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); return -ENODEV; } else if (stats->flags & BTRFS_DEV_STATS_RESET) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (stats->nr_items > i) stats->values[i] = btrfs_dev_stat_read_and_reset(dev, i); else btrfs_dev_stat_set(dev, i, 0); } btrfs_info(fs_info, "device stats zeroed by %s (%d)", current->comm, task_pid_nr(current)); } else { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) if (stats->nr_items > i) stats->values[i] = btrfs_dev_stat_read(dev, i); } if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; return 0; } /* * Update the size and bytes used for each device where it changed. This is * delayed since we would otherwise get errors while writing out the * superblocks. * * Must be invoked during transaction commit. */ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { struct btrfs_device *curr, *next; ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); if (list_empty(&trans->dev_update_list)) return; /* * We don't need the device_list_mutex here. This list is owned by the * transaction and the transaction must complete before the device is * released. */ mutex_lock(&trans->fs_info->chunk_mutex); list_for_each_entry_safe(curr, next, &trans->dev_update_list, post_commit_list) { list_del_init(&curr->post_commit_list); curr->commit_total_bytes = curr->disk_total_bytes; curr->commit_bytes_used = curr->bytes_used; } mutex_unlock(&trans->fs_info->chunk_mutex); } /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. */ int btrfs_bg_type_to_factor(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); return btrfs_raid_array[index].ncopies; } static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct btrfs_chunk_map *map; struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; int i; map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); if (!map) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); ret = -EUCLEAN; goto out; } stripe_len = btrfs_calc_stripe_length(map); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", physical_offset, devid, map->start, physical_len, stripe_len); ret = -EUCLEAN; goto out; } /* * Very old mkfs.btrfs (before v4.1) will not respect the reserved * space. Although kernel can handle it without problem, better to warn * the users. */ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) btrfs_warn(fs_info, "devid %llu physical %llu len %llu inside the reserved space", devid, physical_offset, physical_len); for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) { found = true; if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, "too many dev extents for chunk %llu found", map->start); ret = -EUCLEAN; goto out; } map->verified_stripes++; break; } } if (!found) { btrfs_err(fs_info, "dev extent physical offset %llu devid %llu has no corresponding chunk", physical_offset, devid); ret = -EUCLEAN; } /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", devid, physical_offset, physical_len, dev->disk_total_bytes); ret = -EUCLEAN; goto out; } if (dev->zone_info) { u64 zone_size = dev->zone_info->zone_size; if (!IS_ALIGNED(physical_offset, zone_size) || !IS_ALIGNED(physical_len, zone_size)) { btrfs_err(fs_info, "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", devid, physical_offset, physical_len); ret = -EUCLEAN; goto out; } } out: btrfs_free_chunk_map(map); return ret; } static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { struct rb_node *node; int ret = 0; read_lock(&fs_info->mapping_tree_lock); for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { struct btrfs_chunk_map *map; map = rb_entry(node, struct btrfs_chunk_map, rb_node); if (map->num_stripes != map->verified_stripes) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", map->start, map->verified_stripes, map->num_stripes); ret = -EUCLEAN; goto out; } } out: read_unlock(&fs_info->mapping_tree_lock); return ret; } /* * Ensure that all dev extents are mapped to correct chunk, otherwise * later chunk allocation/free would cause unexpected behavior. * * NOTE: This will iterate through the whole device tree, which should be of * the same size level as the chunk tree. This slightly increases mount time. */ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; u64 prev_devid = 0; u64 prev_dev_ext_end = 0; int ret = 0; /* * We don't have a dev_root because we mounted with ignorebadroots and * failed to load the root, so we want to skip the verification in this * case for sure. * * However if the dev root is fine, but the tree itself is corrupted * we'd still fail to mount. This verification is only to make sure * writes can happen safely, so instead just bypass this check * completely in the case of IGNOREBADROOTS. */ if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) return 0; key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; /* No dev extents at all? Not good */ if (ret > 0) { ret = -EUCLEAN; goto out; } } while (1) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dev_extent *dext; int slot = path->slots[0]; u64 chunk_offset; u64 physical_offset; u64 physical_len; u64 devid; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type != BTRFS_DEV_EXTENT_KEY) break; devid = key.objectid; physical_offset = key.offset; dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); /* Check if this dev extent overlaps with the previous one */ if (devid == prev_devid && physical_offset < prev_dev_ext_end) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); ret = -EUCLEAN; goto out; } ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; prev_devid = devid; prev_dev_ext_end = physical_offset + physical_len; ret = btrfs_next_item(root, path); if (ret < 0) goto out; if (ret > 0) { ret = 0; break; } } /* Ensure all chunks have corresponding dev extents */ ret = verify_chunk_dev_extent_mapping(fs_info); out: btrfs_free_path(path); return ret; } /* * Check whether the given block group or device is pinned by any inode being * used as a swapfile. */ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) { struct btrfs_swapfile_pin *sp; struct rb_node *node; spin_lock(&fs_info->swapfile_pins_lock); node = fs_info->swapfile_pins.rb_node; while (node) { sp = rb_entry(node, struct btrfs_swapfile_pin, node); if (ptr < sp->ptr) node = node->rb_left; else if (ptr > sp->ptr) node = node->rb_right; else break; } spin_unlock(&fs_info->swapfile_pins_lock); return node != NULL; } static int relocating_repair_kthread(void *data) { struct btrfs_block_group *cache = data; struct btrfs_fs_info *fs_info = cache->fs_info; u64 target; int ret = 0; target = cache->start; btrfs_put_block_group(cache); sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); sb_end_write(fs_info->sb); return -EBUSY; } mutex_lock(&fs_info->reclaim_bgs_lock); /* Ensure block group still exists */ cache = btrfs_lookup_block_group(fs_info, target); if (!cache) goto out; if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) goto out; ret = btrfs_may_alloc_data_chunk(fs_info, target); if (ret < 0) goto out; btrfs_info(fs_info, "zoned: relocating block group %llu to repair IO failure", target); ret = btrfs_relocate_chunk(fs_info, target); out: if (cache) btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); return ret; } bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; if (!btrfs_is_zoned(fs_info)) return false; /* Do not attempt to repair in degraded state */ if (btrfs_test_opt(fs_info, DEGRADED)) return true; cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return true; if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { btrfs_put_block_group(cache); return true; } kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); return true; } static void map_raid56_repair_block(struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, u64 logical) { int data_stripes = nr_bioc_data_stripes(bioc); int i; for (i = 0; i < data_stripes; i++) { u64 stripe_start = bioc->full_stripe_logical + btrfs_stripe_nr_to_offset(i); if (logical >= stripe_start && logical < stripe_start + BTRFS_STRIPE_LEN) break; } ASSERT(i < data_stripes); smap->dev = bioc->stripes[i].dev; smap->physical = bioc->stripes[i].physical + ((logical - bioc->full_stripe_logical) & BTRFS_STRIPE_LEN_MASK); } /* * Map a repair write into a single device. * * A repair write is triggered by read time repair or scrub, which would only * update the contents of a single device. * Not update any other mirrors nor go through RMW path. * * Callers should ensure: * * - Call btrfs_bio_counter_inc_blocked() first * - The range does not cross stripe boundary * - Has a valid @mirror_num passed in. */ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num) { struct btrfs_io_context *bioc = NULL; u64 map_length = length; int mirror_ret = mirror_num; int ret; ASSERT(mirror_num > 0); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, smap, &mirror_ret); if (ret < 0) return ret; /* The map range should not cross stripe boundary. */ ASSERT(map_length >= length); /* Already mapped to single stripe. */ if (!bioc) goto out; /* Map the RAID56 multi-stripe writes to a single one. */ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { map_raid56_repair_block(bioc, smap, logical); goto out; } ASSERT(mirror_num <= bioc->num_stripes); smap->dev = bioc->stripes[mirror_num - 1].dev; smap->physical = bioc->stripes[mirror_num - 1].physical; out: btrfs_put_bioc(bioc); ASSERT(smap->dev); return 0; }
84 52 84 84 84 84 84 84 84 84 84 84 84 84 84 84 83 84 84 52 51 52 51 52 84 84 84 52 111 111 52 84 52 52 52 52 84 84 84 84 84 84 111 111 84 84 52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the core interrupt handling code, for irq-chip based * architectures. Detailed information is available in * Documentation/core-api/genericirq.rst */ #include <linux/irq.h> #include <linux/msi.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/irqdomain.h> #include <trace/events/irq.h> #include "internals.h" static irqreturn_t bad_chained_irq(int irq, void *dev_id) { WARN_ONCE(1, "Chained irq %d should not call an action\n", irq); return IRQ_NONE; } /* * Chained handlers should never call action on their IRQ. This default * action will emit warning if such thing happens. */ struct irqaction chained_action = { .handler = bad_chained_irq, }; /** * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure */ int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in * allocated_irqs. */ irq_mark_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); /** * irq_set_irq_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int irq_set_irq_type(unsigned int irq, unsigned int type) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) return -EINVAL; ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_type); /** * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ int irq_set_handler_data(unsigned int irq, void *data) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; desc->irq_common_data.handler_data = data; irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_handler_data); /** * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset * @irq_base: Interrupt number base * @irq_offset: Interrupt number offset * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq at offset */ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; desc->irq_common_data.msi_desc = entry; if (entry && !irq_offset) entry->irq = irq_base; irq_put_desc_unlock(desc, flags); return 0; } /** * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { return irq_set_msi_desc_off(irq, 0, entry); } /** * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ int irq_set_chip_data(unsigned int irq, void *data) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; desc->irq_data.chip_data = data; irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return desc ? &desc->irq_data : NULL; } EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); } static void irq_state_set_started(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); } enum { IRQ_STARTUP_NORMAL, IRQ_STARTUP_MANAGED, IRQ_STARTUP_ABORT, }; #ifdef CONFIG_SMP static int __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, bool force) { struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) return IRQ_STARTUP_NORMAL; irqd_clr_managed_shutdown(d); if (!cpumask_intersects(aff, cpu_online_mask)) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt * installment or irq auto probing should not happen on * managed irqs either. */ if (WARN_ON_ONCE(force)) return IRQ_STARTUP_ABORT; /* * The interrupt was requested, but there is no online CPU * in it's affinity mask. Put it into managed shutdown * state and let the cpu hotplug mechanism start it up once * a CPU in the mask becomes available. */ return IRQ_STARTUP_ABORT; } /* * Managed interrupts have reserved resources, so this should not * happen. */ if (WARN_ON(irq_domain_activate_irq(d, false))) return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } #else static __always_inline int __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, bool force) { return IRQ_STARTUP_NORMAL; } #endif static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); int ret = 0; /* Warn if this interrupt is not activated but try nevertheless */ WARN_ON_ONCE(!irqd_is_activated(d)); if (d->chip->irq_startup) { ret = d->chip->irq_startup(d); irq_state_clr_disabled(desc); irq_state_clr_masked(desc); } else { irq_enable(desc); } irq_state_set_started(desc); return ret; } int irq_startup(struct irq_desc *desc, bool resend, bool force) { struct irq_data *d = irq_desc_get_irq_data(desc); const struct cpumask *aff = irq_data_get_affinity_mask(d); int ret = 0; desc->depth = 0; if (irqd_is_started(d)) { irq_enable(desc); } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) irq_setup_affinity(desc); ret = __irq_startup(desc); if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false); ret = __irq_startup(desc); break; case IRQ_STARTUP_ABORT: irqd_set_managed_shutdown(d); return 0; } } if (resend) check_irq_resend(desc, false); return ret; } int irq_activate(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) return irq_domain_activate_irq(d, false); return 0; } int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) return 0; return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); void irq_shutdown(struct irq_desc *desc) { if (irqd_is_started(&desc->irq_data)) { clear_irq_resend(desc); desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) { desc->irq_data.chip->irq_shutdown(&desc->irq_data); irq_state_set_disabled(desc); irq_state_set_masked(desc); } else { __irq_disable(desc, true); } irq_state_clr_started(desc); } } void irq_shutdown_and_deactivate(struct irq_desc *desc) { irq_shutdown(desc); /* * This must be called even if the interrupt was never started up, * because the activation can happen before the interrupt is * available for request/startup. It has it's own state tracking so * it's safe to call it unconditionally. */ irq_domain_deactivate_irq(&desc->irq_data); } void irq_enable(struct irq_desc *desc) { if (!irqd_irq_disabled(&desc->irq_data)) { unmask_irq(desc); } else { irq_state_clr_disabled(desc); if (desc->irq_data.chip->irq_enable) { desc->irq_data.chip->irq_enable(&desc->irq_data); irq_state_clr_masked(desc); } else { unmask_irq(desc); } } } static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { if (mask) mask_irq(desc); } else { irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); irq_state_set_masked(desc); } else if (mask) { mask_irq(desc); } } } /** * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled * * If the chip does not implement the irq_disable callback, we * use a lazy disable approach. That means we mark the interrupt * disabled, but leave the hardware unmasked. That's an * optimization because we avoid the hardware access for the * common case where no interrupt happens after we marked it * disabled. If an interrupt happens, then the interrupt flow * handler masks the line at the hardware level and marks it * pending. * * If the interrupt chip does not implement the irq_disable callback, * a driver can disable the lazy approach for a particular irq line by * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can * be used for devices which cannot disable the interrupt at the * device level under certain circumstances and have to use * disable_irq[_nosync] instead. */ void irq_disable(struct irq_desc *desc) { __irq_disable(desc, irq_settings_disable_unlazy(desc)); } void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) { if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else desc->irq_data.chip->irq_unmask(&desc->irq_data); cpumask_set_cpu(cpu, desc->percpu_enabled); } void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) { if (desc->irq_data.chip->irq_disable) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); cpumask_clear_cpu(cpu, desc->percpu_enabled); } static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) { desc->irq_data.chip->irq_mask_ack(&desc->irq_data); irq_state_set_masked(desc); } else { mask_irq(desc); if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } } void mask_irq(struct irq_desc *desc) { if (irqd_irq_masked(&desc->irq_data)) return; if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); irq_state_set_masked(desc); } } void unmask_irq(struct irq_desc *desc) { if (!irqd_irq_masked(&desc->irq_data)) return; if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); irq_state_clr_masked(desc); } } void unmask_threaded_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; if (chip->flags & IRQCHIP_EOI_THREADED) chip->irq_eoi(&desc->irq_data); unmask_irq(desc); } /* * handle_nested_irq - Handle a nested irq from a irq thread * @irq: the interrupt number * * Handle interrupts which are nested into a threaded interrupt * handler. The handler function is called inside the calling * threads context. */ void handle_nested_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; irqreturn_t action_ret; might_sleep(); raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); return; } kstat_incr_irqs_this_cpu(desc); atomic_inc(&desc->threads_active); raw_spin_unlock_irq(&desc->lock); action_ret = IRQ_NONE; for_each_action_of_desc(desc, action) action_ret |= action->thread_fn(action->irq, action->dev_id); if (!irq_settings_no_debug(desc)) note_interrupt(desc, action_ret); wake_threads_waitq(desc); } EXPORT_SYMBOL_GPL(handle_nested_irq); static bool irq_check_poll(struct irq_desc *desc) { if (!(desc->istate & IRQS_POLL_INPROGRESS)) return false; return irq_wait_for_poll(desc); } static bool irq_may_run(struct irq_desc *desc) { unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; /* * If the interrupt is not in progress and is not an armed * wakeup interrupt, proceed. */ if (!irqd_has_set(&desc->irq_data, mask)) return true; /* * If the interrupt is an armed wakeup source, mark it pending * and suspended, disable it and notify the pm core about the * event. */ if (irq_pm_check_wakeup(desc)) return false; /* * Handle a potential concurrent poll on a different core. */ return irq_check_poll(desc); } /** * handle_simple_irq - Simple and software-decoded IRQs. * @desc: the interrupt description structure for this irq * * Simple interrupts are either sent from a demultiplexing interrupt * handler or come from hardware, where no interrupt hardware control * is necessary. * * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ void handle_simple_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_simple_irq); /** * handle_untracked_irq - Simple and software-decoded IRQs. * @desc: the interrupt description structure for this irq * * Untracked interrupts are sent from a demultiplexing interrupt * handler when the demultiplexer does not know which device it its * multiplexed irq domain generated the interrupt. IRQ's handled * through here are not subjected to stats tracking, randomness, or * spurious interrupt detection. * * Note: Like handle_simple_irq, the caller is expected to handle * the ack, clear, mask and unmask issues if necessary. */ void handle_untracked_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } desc->istate &= ~IRQS_PENDING; irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); __handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_untracked_irq); /* * Called unconditionally from handle_level_irq() and only for oneshot * interrupts from handle_fasteoi_irq() */ static void cond_unmask_irq(struct irq_desc *desc) { /* * We need to unmask in the following cases: * - Standard level irq (IRQF_ONESHOT is not set) * - Oneshot irq which did not wake the thread (caused by a * spurious interrupt or a primary handler handling it * completely). */ if (!irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) unmask_irq(desc); } /** * handle_level_irq - Level type irq handler * @desc: the interrupt description structure for this irq * * Level type interrupts are active as long as the hardware line has * the active level. This may require to mask the interrupt and unmask * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ void handle_level_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); mask_ack_irq(desc); if (!irq_may_run(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * keep it masked and get out of here */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) { if (!(desc->istate & IRQS_ONESHOT)) { chip->irq_eoi(&desc->irq_data); return; } /* * We need to unmask in the following cases: * - Oneshot irq which did not wake the thread (caused by a * spurious interrupt or a primary handler handling it * completely). */ if (!irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) { chip->irq_eoi(&desc->irq_data); unmask_irq(desc); } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) { chip->irq_eoi(&desc->irq_data); } } /** * handle_fasteoi_irq - irq handler for transparent controllers * @desc: the interrupt description structure for this irq * * Only a single callback will be issued to the chip: an ->eoi() * call when the interrupt has been serviced. This enables support * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; raw_spin_lock(&desc->lock); /* * When an affinity change races with IRQ handling, the next interrupt * can arrive on the new CPU before the original CPU has completed * handling the previous one - it may need to be resent. */ if (!irq_may_run(desc)) { if (irqd_needs_resend_when_in_progress(&desc->irq_data)) desc->istate |= IRQS_PENDING; goto out; } desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); /* * When the race described above happens this will resend the interrupt. */ if (unlikely(desc->istate & IRQS_PENDING)) check_irq_resend(desc, false); raw_spin_unlock(&desc->lock); return; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_fasteoi_nmi - irq handler for NMI interrupt lines * @desc: the interrupt description structure for this irq * * A simple NMI-safe handler, considering the restrictions * from request_nmi. * * Only a single callback will be issued to the chip: an ->eoi() * call when the interrupt has been serviced. This enables support * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ void handle_fasteoi_nmi(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; __kstat_incr_irqs_this_cpu(desc); trace_irq_handler_entry(irq, action); /* * NMIs cannot be shared, there is only one action. */ res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); /** * handle_edge_irq - edge type IRQ handler * @desc: the interrupt description structure for this irq * * Interrupt occurs on the falling and/or rising edge of a hardware * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one * is handled by the associated event handler. If this happens it * might be necessary to disable (mask) the interrupt depending on the * controller hardware. This requires to reenable the interrupt inside * of the loop which handles the interrupts which have arrived while * the handler was running. If all pending interrupts are handled, the * loop is left. */ void handle_edge_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (!irq_may_run(desc)) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } /* * If its disabled or no action available then mask it and get * out of here. */ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } kstat_incr_irqs_this_cpu(desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); do { if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } /* * When another irq arrived while we were handling * one, we could have masked the irq. * Reenable it, if it was not disabled in meantime. */ if (unlikely(desc->istate & IRQS_PENDING)) { if (!irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_irq(desc); } handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** * handle_edge_eoi_irq - edge eoi type IRQ handler * @desc: the interrupt description structure for this irq * * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. */ void handle_edge_eoi_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (!irq_may_run(desc)) { desc->istate |= IRQS_PENDING; goto out_eoi; } /* * If its disabled or no action available then mask it and get * out of here. */ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { desc->istate |= IRQS_PENDING; goto out_eoi; } kstat_incr_irqs_this_cpu(desc); do { if (unlikely(!desc->action)) goto out_eoi; handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); out_eoi: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } #endif /** * handle_percpu_irq - Per CPU local irq handler * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements */ void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); /* * PER CPU interrupts are not serialized. Do not touch * desc->tot_count. */ __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); handle_irq_event_percpu(desc); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } /** * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements. Same as * handle_percpu_irq() above but with the following extras: * * action->percpu_dev_id is a pointer to percpu variables which * contain the real device id for the cpu on which this handler is * called */ void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; /* * PER CPU interrupts are not serialized. Do not touch * desc->tot_count. */ __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); if (likely(action)) { trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); } else { unsigned int cpu = smp_processor_id(); bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); if (enabled) irq_percpu_disable(desc, cpu); pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", enabled ? " and unmasked" : "", irq, cpu); } if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } /** * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu * dev ids * @desc: the interrupt description structure for this irq * * Similar to handle_fasteoi_nmi, but handling the dev_id cookie * as a percpu pointer. */ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; __kstat_incr_irqs_this_cpu(desc); trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) { if (!handle) { handle = handle_bad_irq; } else { struct irq_data *irq_data = &desc->irq_data; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* * With hierarchical domains we might run into a * situation where the outermost chip is not yet set * up, but the inner chips are there. Instead of * bailing we install the handler, but obviously we * cannot enable/startup the interrupt at this point. */ while (irq_data) { if (irq_data->chip != &no_irq_chip) break; /* * Bail out if the outer chip is not set up * and the interrupt supposed to be started * right away. */ if (WARN_ON(is_chained)) return; /* Try the parent */ irq_data = irq_data->parent_data; } #endif if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) return; } /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); if (is_chained) { desc->action = NULL; WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc))); } desc->depth = 1; } desc->handle_irq = handle; desc->name = name; if (handle != handle_bad_irq && is_chained) { unsigned int type = irqd_get_trigger_type(&desc->irq_data); /* * We're about to start this interrupt immediately, * hence the need to set the trigger configuration. * But the .set_type callback may have overridden the * flow handler, ignoring that we're dealing with a * chained interrupt. Reset it immediately because we * do know better. */ if (type != IRQ_TYPE_NONE) { __irq_set_trigger(desc, type); desc->handle_irq = handle; } irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc))); irq_activate_and_startup(desc, IRQ_RESEND); } } void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return; __irq_do_set_handler(desc, handle, is_chained, name); irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(__irq_set_handler); void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, void *data) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return; desc->irq_common_data.handler_data = data; __irq_do_set_handler(desc, handle, 1, NULL); irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); void irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); __irq_set_handler(irq, handle, 0, name); } EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return; /* * Warn when a driver sets the no autoenable flag on an already * active interrupt. */ WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); irq_settings_clr_and_set(desc, clr, set); trigger = irqd_get_trigger_type(&desc->irq_data); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); tmp = irq_settings_get_trigger_mask(desc); if (tmp != IRQ_TYPE_NONE) trigger = tmp; irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(irq_modify_status); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE /** * irq_cpu_online - Invoke all irq_cpu_online functions. * * Iterate through all irqs and invoke the chip.irq_cpu_online() * for each. */ void irq_cpu_online(void) { struct irq_desc *desc; struct irq_chip *chip; unsigned long flags; unsigned int irq; for_each_active_irq(irq) { desc = irq_to_desc(irq); if (!desc) continue; raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); } } /** * irq_cpu_offline - Invoke all irq_cpu_offline functions. * * Iterate through all irqs and invoke the chip.irq_cpu_offline() * for each. */ void irq_cpu_offline(void) { struct irq_desc *desc; struct irq_chip *chip; unsigned long flags; unsigned int irq; for_each_active_irq(irq) { desc = irq_to_desc(irq); if (!desc) continue; raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); } } #endif #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS /** * handle_fasteoi_ack_irq - irq handler for edge hierarchy * stacked on transparent controllers * * @desc: the interrupt description structure for this irq * * Like handle_fasteoi_irq(), but for use with hierarchy where * the irq_chip also needs to have its ->irq_ack() function * called. */ void handle_fasteoi_ack_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); raw_spin_unlock(&desc->lock); return; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq); /** * handle_fasteoi_mask_irq - irq handler for level hierarchy * stacked on transparent controllers * * @desc: the interrupt description structure for this irq * * Like handle_fasteoi_irq(), but for use with hierarchy where * the irq_chip also needs to have its ->irq_mask_ack() function * called. */ void handle_fasteoi_mask_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; raw_spin_lock(&desc->lock); mask_ack_irq(desc); if (!irq_may_run(desc)) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); raw_spin_unlock(&desc->lock); return; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ /** * irq_chip_set_parent_state - set the state of a parent interrupt. * * @data: Pointer to interrupt specific data * @which: State to be restored (one of IRQCHIP_STATE_*) * @val: Value corresponding to @which * * Conditional success, if the underlying irqchip does not implement it. */ int irq_chip_set_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool val) { data = data->parent_data; if (!data || !data->chip->irq_set_irqchip_state) return 0; return data->chip->irq_set_irqchip_state(data, which, val); } EXPORT_SYMBOL_GPL(irq_chip_set_parent_state); /** * irq_chip_get_parent_state - get the state of a parent interrupt. * * @data: Pointer to interrupt specific data * @which: one of IRQCHIP_STATE_* the caller wants to know * @state: a pointer to a boolean where the state is to be stored * * Conditional success, if the underlying irqchip does not implement it. */ int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { data = data->parent_data; if (!data || !data->chip->irq_get_irqchip_state) return 0; return data->chip->irq_get_irqchip_state(data, which, state); } EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) * @data: Pointer to interrupt specific data */ void irq_chip_enable_parent(struct irq_data *data) { data = data->parent_data; if (data->chip->irq_enable) data->chip->irq_enable(data); else data->chip->irq_unmask(data); } EXPORT_SYMBOL_GPL(irq_chip_enable_parent); /** * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if * NULL) * @data: Pointer to interrupt specific data */ void irq_chip_disable_parent(struct irq_data *data) { data = data->parent_data; if (data->chip->irq_disable) data->chip->irq_disable(data); else data->chip->irq_mask(data); } EXPORT_SYMBOL_GPL(irq_chip_disable_parent); /** * irq_chip_ack_parent - Acknowledge the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_ack_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_ack(data); } EXPORT_SYMBOL_GPL(irq_chip_ack_parent); /** * irq_chip_mask_parent - Mask the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_mask_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_mask(data); } EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_mask_ack_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_mask_ack(data); } EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_unmask_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_unmask(data); } EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_eoi_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_eoi(data); } EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt * @data: Pointer to interrupt specific data * @dest: The affinity mask to set * @force: Flag to enforce setting (disable online checks) * * Conditional, as the underlying parent chip might not implement it. */ int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force) { data = data->parent_data; if (data->chip->irq_set_affinity) return data->chip->irq_set_affinity(data, dest, force); return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_affinity_parent); /** * irq_chip_set_type_parent - Set IRQ type on the parent interrupt * @data: Pointer to interrupt specific data * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h * * Conditional, as the underlying parent chip might not implement it. */ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) { data = data->parent_data; if (data->chip->irq_set_type) return data->chip->irq_set_type(data, type); return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware * @data: Pointer to interrupt specific data * * Iterate through the domain hierarchy of the interrupt and check * whether a hw retrigger function exists. If yes, invoke it. */ int irq_chip_retrigger_hierarchy(struct irq_data *data) { for (data = data->parent_data; data; data = data->parent_data) if (data->chip && data->chip->irq_retrigger) return data->chip->irq_retrigger(data); return 0; } EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy); /** * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt * @data: Pointer to interrupt specific data * @vcpu_info: The vcpu affinity information */ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) { data = data->parent_data; if (data->chip->irq_set_vcpu_affinity) return data->chip->irq_set_vcpu_affinity(data, vcpu_info); return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent); /** * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt * @data: Pointer to interrupt specific data * @on: Whether to set or reset the wake-up capability of this irq * * Conditional, as the underlying parent chip might not implement it. */ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) { data = data->parent_data; if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE) return 0; if (data->chip->irq_set_wake) return data->chip->irq_set_wake(data, on); return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); /** * irq_chip_request_resources_parent - Request resources on the parent interrupt * @data: Pointer to interrupt specific data */ int irq_chip_request_resources_parent(struct irq_data *data) { data = data->parent_data; if (data->chip->irq_request_resources) return data->chip->irq_request_resources(data); /* no error on missing optional irq_chip::irq_request_resources */ return 0; } EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); /** * irq_chip_release_resources_parent - Release resources on the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_release_resources_parent(struct irq_data *data) { data = data->parent_data; if (data->chip->irq_release_resources) data->chip->irq_release_resources(data); } EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); #endif /** * irq_chip_compose_msi_msg - Compose msi message for a irq chip * @data: Pointer to interrupt specific data * @msg: Pointer to the MSI message * * For hierarchical domains we find the first chip in the hierarchy * which implements the irq_compose_msi_msg callback. For non * hierarchical we use the top level chip. */ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { struct irq_data *pos; for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) { if (data->chip && data->chip->irq_compose_msi_msg) pos = data; } if (!pos) return -ENOSYS; pos->chip->irq_compose_msi_msg(pos, msg); return 0; } static struct device *irq_get_pm_device(struct irq_data *data) { if (data->domain) return data->domain->pm_dev; return NULL; } /** * irq_chip_pm_get - Enable power for an IRQ chip * @data: Pointer to interrupt specific data * * Enable the power to the IRQ chip referenced by the interrupt data * structure. */ int irq_chip_pm_get(struct irq_data *data) { struct device *dev = irq_get_pm_device(data); int retval = 0; if (IS_ENABLED(CONFIG_PM) && dev) retval = pm_runtime_resume_and_get(dev); return retval; } /** * irq_chip_pm_put - Disable power for an IRQ chip * @data: Pointer to interrupt specific data * * Disable the power to the IRQ chip referenced by the interrupt data * structure, belongs. Note that power will only be disabled, once this * function has been called for all IRQs that have called irq_chip_pm_get(). */ int irq_chip_pm_put(struct irq_data *data) { struct device *dev = irq_get_pm_device(data); int retval = 0; if (IS_ENABLED(CONFIG_PM) && dev) retval = pm_runtime_put(dev); return (retval < 0) ? retval : 0; }
57 57 57 57 57 57 57 57 57 56 57 57 57 57 57 57 57 57 57 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 /* * Copyright (C) 2011-2013 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/errno.h> #include <linux/export.h> #include <linux/kernel.h> #include <drm/drm_mode.h> #include <drm/drm_print.h> #include <drm/drm_rect.h> /** * drm_rect_intersect - intersect two rectangles * @r1: first rectangle * @r2: second rectangle * * Calculate the intersection of rectangles @r1 and @r2. * @r1 will be overwritten with the intersection. * * RETURNS: * %true if rectangle @r1 is still visible after the operation, * %false otherwise. */ bool drm_rect_intersect(struct drm_rect *r1, const struct drm_rect *r2) { r1->x1 = max(r1->x1, r2->x1); r1->y1 = max(r1->y1, r2->y1); r1->x2 = min(r1->x2, r2->x2); r1->y2 = min(r1->y2, r2->y2); return drm_rect_visible(r1); } EXPORT_SYMBOL(drm_rect_intersect); static u32 clip_scaled(int src, int dst, int *clip) { u64 tmp; if (dst == 0) return 0; /* Only clip what we have. Keeps the result bounded. */ *clip = min(*clip, dst); tmp = mul_u32_u32(src, dst - *clip); /* * Round toward 1.0 when clipping so that we don't accidentally * change upscaling to downscaling or vice versa. */ if (src < (dst << 16)) return DIV_ROUND_UP_ULL(tmp, dst); else return DIV_ROUND_DOWN_ULL(tmp, dst); } /** * drm_rect_clip_scaled - perform a scaled clip operation * @src: source window rectangle * @dst: destination window rectangle * @clip: clip rectangle * * Clip rectangle @dst by rectangle @clip. Clip rectangle @src by * the corresponding amounts, retaining the vertical and horizontal scaling * factors from @src to @dst. * * RETURNS: * %true if rectangle @dst is still visible after being clipped, * %false otherwise. */ bool drm_rect_clip_scaled(struct drm_rect *src, struct drm_rect *dst, const struct drm_rect *clip) { int diff; diff = clip->x1 - dst->x1; if (diff > 0) { u32 new_src_w = clip_scaled(drm_rect_width(src), drm_rect_width(dst), &diff); src->x1 = src->x2 - new_src_w; dst->x1 += diff; } diff = clip->y1 - dst->y1; if (diff > 0) { u32 new_src_h = clip_scaled(drm_rect_height(src), drm_rect_height(dst), &diff); src->y1 = src->y2 - new_src_h; dst->y1 += diff; } diff = dst->x2 - clip->x2; if (diff > 0) { u32 new_src_w = clip_scaled(drm_rect_width(src), drm_rect_width(dst), &diff); src->x2 = src->x1 + new_src_w; dst->x2 -= diff; } diff = dst->y2 - clip->y2; if (diff > 0) { u32 new_src_h = clip_scaled(drm_rect_height(src), drm_rect_height(dst), &diff); src->y2 = src->y1 + new_src_h; dst->y2 -= diff; } return drm_rect_visible(dst); } EXPORT_SYMBOL(drm_rect_clip_scaled); static int drm_calc_scale(int src, int dst) { int scale = 0; if (WARN_ON(src < 0 || dst < 0)) return -EINVAL; if (dst == 0) return 0; if (src > (dst << 16)) return DIV_ROUND_UP(src, dst); else scale = src / dst; return scale; } /** * drm_rect_calc_hscale - calculate the horizontal scaling factor * @src: source window rectangle * @dst: destination window rectangle * @min_hscale: minimum allowed horizontal scaling factor * @max_hscale: maximum allowed horizontal scaling factor * * Calculate the horizontal scaling factor as * (@src width) / (@dst width). * * If the scale is below 1 << 16, round down. If the scale is above * 1 << 16, round up. This will calculate the scale with the most * pessimistic limit calculation. * * RETURNS: * The horizontal scaling factor, or errno of out of limits. */ int drm_rect_calc_hscale(const struct drm_rect *src, const struct drm_rect *dst, int min_hscale, int max_hscale) { int src_w = drm_rect_width(src); int dst_w = drm_rect_width(dst); int hscale = drm_calc_scale(src_w, dst_w); if (hscale < 0 || dst_w == 0) return hscale; if (hscale < min_hscale || hscale > max_hscale) return -ERANGE; return hscale; } EXPORT_SYMBOL(drm_rect_calc_hscale); /** * drm_rect_calc_vscale - calculate the vertical scaling factor * @src: source window rectangle * @dst: destination window rectangle * @min_vscale: minimum allowed vertical scaling factor * @max_vscale: maximum allowed vertical scaling factor * * Calculate the vertical scaling factor as * (@src height) / (@dst height). * * If the scale is below 1 << 16, round down. If the scale is above * 1 << 16, round up. This will calculate the scale with the most * pessimistic limit calculation. * * RETURNS: * The vertical scaling factor, or errno of out of limits. */ int drm_rect_calc_vscale(const struct drm_rect *src, const struct drm_rect *dst, int min_vscale, int max_vscale) { int src_h = drm_rect_height(src); int dst_h = drm_rect_height(dst); int vscale = drm_calc_scale(src_h, dst_h); if (vscale < 0 || dst_h == 0) return vscale; if (vscale < min_vscale || vscale > max_vscale) return -ERANGE; return vscale; } EXPORT_SYMBOL(drm_rect_calc_vscale); /** * drm_rect_debug_print - print the rectangle information * @prefix: prefix string * @r: rectangle to print * @fixed_point: rectangle is in 16.16 fixed point format */ void drm_rect_debug_print(const char *prefix, const struct drm_rect *r, bool fixed_point) { if (fixed_point) DRM_DEBUG_KMS("%s" DRM_RECT_FP_FMT "\n", prefix, DRM_RECT_FP_ARG(r)); else DRM_DEBUG_KMS("%s" DRM_RECT_FMT "\n", prefix, DRM_RECT_ARG(r)); } EXPORT_SYMBOL(drm_rect_debug_print); /** * drm_rect_rotate - Rotate the rectangle * @r: rectangle to be rotated * @width: Width of the coordinate space * @height: Height of the coordinate space * @rotation: Transformation to be applied * * Apply @rotation to the coordinates of rectangle @r. * * @width and @height combined with @rotation define * the location of the new origin. * * @width correcsponds to the horizontal and @height * to the vertical axis of the untransformed coordinate * space. */ void drm_rect_rotate(struct drm_rect *r, int width, int height, unsigned int rotation) { struct drm_rect tmp; if (rotation & (DRM_MODE_REFLECT_X | DRM_MODE_REFLECT_Y)) { tmp = *r; if (rotation & DRM_MODE_REFLECT_X) { r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; } if (rotation & DRM_MODE_REFLECT_Y) { r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; } } switch (rotation & DRM_MODE_ROTATE_MASK) { case DRM_MODE_ROTATE_0: break; case DRM_MODE_ROTATE_90: tmp = *r; r->x1 = tmp.y1; r->x2 = tmp.y2; r->y1 = width - tmp.x2; r->y2 = width - tmp.x1; break; case DRM_MODE_ROTATE_180: tmp = *r; r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; break; case DRM_MODE_ROTATE_270: tmp = *r; r->x1 = height - tmp.y2; r->x2 = height - tmp.y1; r->y1 = tmp.x1; r->y2 = tmp.x2; break; default: break; } } EXPORT_SYMBOL(drm_rect_rotate); /** * drm_rect_rotate_inv - Inverse rotate the rectangle * @r: rectangle to be rotated * @width: Width of the coordinate space * @height: Height of the coordinate space * @rotation: Transformation whose inverse is to be applied * * Apply the inverse of @rotation to the coordinates * of rectangle @r. * * @width and @height combined with @rotation define * the location of the new origin. * * @width correcsponds to the horizontal and @height * to the vertical axis of the original untransformed * coordinate space, so that you never have to flip * them when doing a rotatation and its inverse. * That is, if you do :: * * drm_rect_rotate(&r, width, height, rotation); * drm_rect_rotate_inv(&r, width, height, rotation); * * you will always get back the original rectangle. */ void drm_rect_rotate_inv(struct drm_rect *r, int width, int height, unsigned int rotation) { struct drm_rect tmp; switch (rotation & DRM_MODE_ROTATE_MASK) { case DRM_MODE_ROTATE_0: break; case DRM_MODE_ROTATE_90: tmp = *r; r->x1 = width - tmp.y2; r->x2 = width - tmp.y1; r->y1 = tmp.x1; r->y2 = tmp.x2; break; case DRM_MODE_ROTATE_180: tmp = *r; r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; break; case DRM_MODE_ROTATE_270: tmp = *r; r->x1 = tmp.y1; r->x2 = tmp.y2; r->y1 = height - tmp.x2; r->y2 = height - tmp.x1; break; default: break; } if (rotation & (DRM_MODE_REFLECT_X | DRM_MODE_REFLECT_Y)) { tmp = *r; if (rotation & DRM_MODE_REFLECT_X) { r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; } if (rotation & DRM_MODE_REFLECT_Y) { r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; } } } EXPORT_SYMBOL(drm_rect_rotate_inv);
1 1 1 1 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/hpfs/super.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * mounting, unmounting, error handling */ #include "hpfs_fn.h" #include <linux/module.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/init.h> #include <linux/statfs.h> #include <linux/magic.h> #include <linux/sched.h> #include <linux/bitmap.h> #include <linux/slab.h> #include <linux/seq_file.h> /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ static void mark_dirty(struct super_block *s, int remount) { if (hpfs_sb(s)->sb_chkdsk && (remount || !sb_rdonly(s))) { struct buffer_head *bh; struct hpfs_spare_block *sb; if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = 1; sb->old_wrote = 0; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } } } /* Mark the filesystem clean (mark it dirty for chkdsk if chkdsk==2 or if there were errors) */ static void unmark_dirty(struct super_block *s) { struct buffer_head *bh; struct hpfs_spare_block *sb; if (sb_rdonly(s)) return; sync_blockdev(s->s_bdev); if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } } /* Filesystem error... */ void hpfs_error(struct super_block *s, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("filesystem error: %pV", &vaf); va_end(args); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { pr_cont("; crashing the system because you wanted it\n"); mark_dirty(s, 0); panic("HPFS panic"); } else if (hpfs_sb(s)->sb_err == 1) { if (sb_rdonly(s)) pr_cont("; already mounted read-only\n"); else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); s->s_flags |= SB_RDONLY; } } else if (sb_rdonly(s)) pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); else pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); } else pr_cont("\n"); hpfs_sb(s)->sb_was_error = 1; } /* * A little trick to detect cycles in many hpfs structures and don't let the * kernel crash on corrupted filesystem. When first called, set c2 to 0. * * BTW. chkdsk doesn't detect cycles correctly. When I had 2 lost directories * nested each in other, chkdsk locked up happilly. */ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, char *msg) { if (*c2 && *c1 == key) { hpfs_error(s, "cycle detected on key %08x in %s", key, msg); return 1; } (*c2)++; if (!((*c2 - 1) & *c2)) *c1 = key; return 0; } static void free_sbi(struct hpfs_sb_info *sbi) { kfree(sbi->sb_cp_table); kfree(sbi->sb_bmp_dir); kfree(sbi); } static void lazy_free_sbi(struct rcu_head *rcu) { free_sbi(container_of(rcu, struct hpfs_sb_info, rcu)); } static void hpfs_put_super(struct super_block *s) { hpfs_lock(s); unmark_dirty(s); hpfs_unlock(s); call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); } static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) { struct quad_buffer_head qbh; unsigned long *bits; unsigned count; bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) return (unsigned)-1; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); hpfs_brelse4(&qbh); return count; } static unsigned count_bitmaps(struct super_block *s) { unsigned n, count, n_bands; n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; count = 0; for (n = 0; n < COUNT_RD_AHEAD; n++) { hpfs_prefetch_bitmap(s, n); } for (n = 0; n < n_bands; n++) { unsigned c; hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); if (c != (unsigned)-1) count += c; } return count; } unsigned hpfs_get_free_dnodes(struct super_block *s) { struct hpfs_sb_info *sbi = hpfs_sb(s); if (sbi->sb_n_free_dnodes == (unsigned)-1) { unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap); if (c == (unsigned)-1) return 0; sbi->sb_n_free_dnodes = c; } return sbi->sb_n_free_dnodes; } static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); hpfs_lock(s); if (sbi->sb_n_free == (unsigned)-1) sbi->sb_n_free = count_bitmaps(s); buf->f_type = s->s_magic; buf->f_bsize = 512; buf->f_blocks = sbi->sb_fs_size; buf->f_bfree = sbi->sb_n_free; buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; buf->f_ffree = hpfs_get_free_dnodes(s); buf->f_fsid = u64_to_fsid(id); buf->f_namelen = 254; hpfs_unlock(s); return 0; } long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg) { switch (cmd) { case FITRIM: { struct fstrim_range range; secno n_trimmed; int r; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed); if (r) return r; range.len = (u64)n_trimmed << 9; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } default: { return -ENOIOCTLCMD; } } } static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; } static void hpfs_free_inode(struct inode *inode) { kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(hpfs_inode_cachep); } enum { Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case, Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift, }; static const struct constant_table hpfs_param_case[] = { {"asis", 0}, {"lower", 1}, {} }; static const struct constant_table hpfs_param_check[] = { {"none", 0}, {"normal", 1}, {"strict", 2}, {} }; static const struct constant_table hpfs_param_err[] = { {"continue", 0}, {"remount-ro", 1}, {"panic", 2}, {} }; static const struct constant_table hpfs_param_eas[] = { {"no", 0}, {"ro", 1}, {"rw", 2}, {} }; static const struct constant_table hpfs_param_chkdsk[] = { {"no", 0}, {"errors", 1}, {"always", 2}, {} }; static const struct fs_parameter_spec hpfs_param_spec[] = { fsparam_flag ("help", Opt_help), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_enum ("case", Opt_case, hpfs_param_case), fsparam_enum ("check", Opt_check, hpfs_param_check), fsparam_enum ("errors", Opt_err, hpfs_param_err), fsparam_enum ("eas", Opt_eas, hpfs_param_eas), fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk), fsparam_s32 ("timeshift", Opt_timeshift), {} }; struct hpfs_fc_context { kuid_t uid; kgid_t gid; umode_t umask; int lowercase; int eas; int chk; int errs; int chkdsk; int timeshift; }; static inline void hpfs_help(void) { pr_info("\n\ HPFS filesystem options:\n\ help do not mount and display this text\n\ uid=xxx set uid of files that don't have uid specified in eas\n\ gid=xxx set gid of files that don't have gid specified in eas\n\ umask=xxx set mode of files that don't have mode specified in eas\n\ case=lower lowercase all files\n\ case=asis do not lowercase files (default)\n\ check=none no fs checks - kernel may crash on corrupted filesystem\n\ check=normal do some checks - it should not crash (default)\n\ check=strict do extra time-consuming checks, used for debugging\n\ errors=continue continue on errors\n\ errors=remount-ro remount read-only if errors found (default)\n\ errors=panic panic on errors\n\ chkdsk=no do not mark fs for chkdsking even if there were errors\n\ chkdsk=errors mark fs dirty if errors found (default)\n\ chkdsk=always always mark fs dirty - used for debugging\n\ eas=no ignore extended attributes\n\ eas=ro read but do not write extended attributes\n\ eas=rw r/w eas => enables chmod, chown, mknod, ln -s (default)\n\ timeshift=nnn add nnn seconds to file times\n\ \n"); } static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hpfs_fc_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, hpfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_help: hpfs_help(); return -EINVAL; case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_umask: ctx->umask = result.uint_32; break; case Opt_case: ctx->lowercase = result.uint_32; break; case Opt_check: ctx->chk = result.uint_32; break; case Opt_err: ctx->errs = result.uint_32; break; case Opt_eas: ctx->eas = result.uint_32; break; case Opt_chkdsk: ctx->chkdsk = result.uint_32; break; case Opt_timeshift: { int m = 1; char *rhs = param->string; int timeshift; if (*rhs == '-') m = -1; if (*rhs == '+' || *rhs == '-') rhs++; timeshift = simple_strtoul(rhs, &rhs, 0) * m; if (*rhs) return -EINVAL; ctx->timeshift = timeshift; break; } default: return -EINVAL; } return 0; } static int hpfs_reconfigure(struct fs_context *fc) { struct hpfs_fc_context *ctx = fc->fs_private; struct super_block *s = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); sync_filesystem(s); fc->sb_flags |= SB_NOATIME; hpfs_lock(s); if (ctx->timeshift != sbi->sb_timeshift) { pr_err("timeshift can't be changed using remount.\n"); goto out_err; } unmark_dirty(s); sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_lowercase = ctx->lowercase; sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; sbi->sb_chkdsk = ctx->chkdsk; sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; out_err: hpfs_unlock(s); return -EINVAL; } static int hpfs_show_options(struct seq_file *seq, struct dentry *root) { struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb); seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid)); seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid)); seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777)); if (sbi->sb_lowercase) seq_printf(seq, ",case=lower"); if (!sbi->sb_chk) seq_printf(seq, ",check=none"); if (sbi->sb_chk == 2) seq_printf(seq, ",check=strict"); if (!sbi->sb_err) seq_printf(seq, ",errors=continue"); if (sbi->sb_err == 2) seq_printf(seq, ",errors=panic"); if (!sbi->sb_chkdsk) seq_printf(seq, ",chkdsk=no"); if (sbi->sb_chkdsk == 2) seq_printf(seq, ",chkdsk=always"); if (!sbi->sb_eas) seq_printf(seq, ",eas=no"); if (sbi->sb_eas == 1) seq_printf(seq, ",eas=ro"); if (sbi->sb_timeshift) seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift); return 0; } /* Super operations */ static const struct super_operations hpfs_sops = { .alloc_inode = hpfs_alloc_inode, .free_inode = hpfs_free_inode, .evict_inode = hpfs_evict_inode, .put_super = hpfs_put_super, .statfs = hpfs_statfs, .show_options = hpfs_show_options, }; static int hpfs_fill_super(struct super_block *s, struct fs_context *fc) { struct hpfs_fc_context *ctx = fc->fs_private; struct buffer_head *bh0, *bh1, *bh2; struct hpfs_boot_block *bootblock; struct hpfs_super_block *superblock; struct hpfs_spare_block *spareblock; struct hpfs_sb_info *sbi; struct inode *root; int silent = fc->sb_flags & SB_SILENT; dnode_secno root_dno; struct hpfs_dirent *de = NULL; struct quad_buffer_head qbh; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; } s->s_fs_info = sbi; mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); /*sbi->sb_mounting = 1;*/ sb_set_blocksize(s, 512); sbi->sb_fs_size = -1; if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; /* Check magics */ if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC || le32_to_cpu(spareblock->magic) != SP_MAGIC) { if (!silent) pr_err("Bad magic ... probably not HPFS\n"); goto bail4; } /* Check version */ if (!sb_rdonly(s) && superblock->funcversion != 2 && superblock->funcversion != 3) { pr_err("Bad version %d,%d. Mount readonly to go around\n", (int)superblock->version, (int)superblock->funcversion); pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); goto bail4; } s->s_flags |= SB_NOATIME; /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; s->s_d_op = &hpfs_dentry_operations; s->s_time_min = local_to_gmt(s, 0); s->s_time_max = local_to_gmt(s, U32_MAX); sbi->sb_root = le32_to_cpu(superblock->root); sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps); sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_n_free = -1; sbi->sb_n_free_dnodes = -1; sbi->sb_lowercase = ctx->lowercase; sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; sbi->sb_chkdsk = ctx->chkdsk; sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; sbi->sb_was_error = 0; sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; sbi->sb_max_fwd_alloc = 0xffffff; if (sbi->sb_fs_size >= 0x80000000) { hpfs_error(s, "invalid size in superblock: %08x", (unsigned)sbi->sb_fs_size); goto bail4; } if (spareblock->n_spares_used) hpfs_load_hotfix_map(s, spareblock); /* Load bitmap directory */ if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { if (sbi->sb_err == 2) { pr_err("Improperly stopped, not mounted\n"); goto bail4; } hpfs_error(s, "improperly stopped"); } if (!sb_rdonly(s)) { spareblock->dirty = 1; spareblock->old_wrote = 0; mark_buffer_dirty(bh2); } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (sbi->sb_err >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); if (sbi->sb_err == 0) pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } if (sbi->sb_chk) { unsigned a; if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x", le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band)); goto bail4; } a = sbi->sb_dirband_size; sbi->sb_dirband_size = 0; if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") || hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") || hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) { mark_dirty(s, 0); goto bail4; } sbi->sb_dirband_size = a; } else pr_err("You really don't want any checks? You are crazy...\n"); /* Load code page table */ if (le32_to_cpu(spareblock->n_code_pages)) if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) pr_err("code page support is disabled\n"); brelse(bh2); brelse(bh1); brelse(bh0); root = iget_locked(s, sbi->sb_root); if (!root) goto bail0; hpfs_init_inode(root); hpfs_read_inode(root); unlock_new_inode(root); s->s_root = d_make_root(root); if (!s->s_root) goto bail0; /* * find the root directory's . pointer & finish filling in the inode */ root_dno = hpfs_fnode_dno(s, sbi->sb_root); if (root_dno) de = map_dirent(root, root_dno, "\001\001", 2, NULL, &qbh); if (!de) hpfs_error(s, "unable to find root dir"); else { inode_set_atime(root, local_to_gmt(s, le32_to_cpu(de->read_date)), 0); inode_set_mtime(root, local_to_gmt(s, le32_to_cpu(de->write_date)), 0); inode_set_ctime(root, local_to_gmt(s, le32_to_cpu(de->creation_date)), 0); hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) root->i_size = 2048; if (root->i_blocks == -1) root->i_blocks = 5; hpfs_brelse4(&qbh); } hpfs_unlock(s); return 0; bail4: brelse(bh2); bail3: brelse(bh1); bail2: brelse(bh0); bail1: bail0: hpfs_unlock(s); free_sbi(sbi); return -EINVAL; } static int hpfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, hpfs_fill_super); } static void hpfs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations hpfs_fc_context_ops = { .parse_param = hpfs_parse_param, .get_tree = hpfs_get_tree, .reconfigure = hpfs_reconfigure, .free = hpfs_free_fc, }; static int hpfs_init_fs_context(struct fs_context *fc) { struct hpfs_fc_context *ctx; ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL); if (!ctx) return -ENOMEM; if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { struct super_block *sb = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(sb); ctx->uid = sbi->sb_uid; ctx->gid = sbi->sb_gid; ctx->umask = 0777 & ~sbi->sb_mode; ctx->lowercase = sbi->sb_lowercase; ctx->eas = sbi->sb_eas; ctx->chk = sbi->sb_chk; ctx->chkdsk = sbi->sb_chkdsk; ctx->errs = sbi->sb_err; ctx->timeshift = sbi->sb_timeshift; } else { ctx->uid = current_uid(); ctx->gid = current_gid(); ctx->umask = current_umask(); ctx->lowercase = 0; ctx->eas = 2; ctx->chk = 1; ctx->errs = 1; ctx->chkdsk = 1; ctx->timeshift = 0; } fc->fs_private = ctx; fc->ops = &hpfs_fc_context_ops; return 0; }; static struct file_system_type hpfs_fs_type = { .owner = THIS_MODULE, .name = "hpfs", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = hpfs_init_fs_context, .parameters = hpfs_param_spec, }; MODULE_ALIAS_FS("hpfs"); static int __init init_hpfs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&hpfs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_hpfs_fs(void) { unregister_filesystem(&hpfs_fs_type); destroy_inodecache(); } module_init(init_hpfs_fs) module_exit(exit_hpfs_fs) MODULE_DESCRIPTION("OS/2 HPFS file system support"); MODULE_LICENSE("GPL");
12 20 20 20 20 20 8 5 8 2 8 8 19 19 19 19 8 8 8 20 20 20 2 20 44 8 8 8 1 1 182 1 1 1 1 24 24 24 24 24 8 24 24 24 24 24 23 6 6 6 6 6 10 10 10 10 10 10 6 16 35 16 10 15 2 2 2 2 2 2 2 2 1 2 2 2 2 2 3 1 3 1 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/wakeup.c - System wakeup events framework * * Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. */ #define pr_fmt(fmt) "PM: " fmt #include <linux/device.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pm_wakeirq.h> #include <trace/events/power.h> #include "power.h" #define list_for_each_entry_rcu_locked(pos, head, member) \ list_for_each_entry_rcu(pos, head, member, \ srcu_read_lock_held(&wakeup_srcu)) /* * If set, the suspend/hibernate code will abort transitions to a sleep state * if wakeup events are registered during or immediately before the transition. */ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ static unsigned int wakeup_irq[2] __read_mostly; static DEFINE_RAW_SPINLOCK(wakeup_irq_lock); /* If greater than 0 and the system is suspending, terminate the suspend. */ static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. * They need to be modified together atomically, so it's better to use one * atomic variable to hold them both. */ static atomic_t combined_event_count = ATOMIC_INIT(0); #define IN_PROGRESS_BITS (sizeof(int) * 4) #define MAX_IN_PROGRESS ((1 << IN_PROGRESS_BITS) - 1) static void split_counters(unsigned int *cnt, unsigned int *inpr) { unsigned int comb = atomic_read(&combined_event_count); *cnt = (comb >> IN_PROGRESS_BITS); *inpr = comb & MAX_IN_PROGRESS; } /* A preserved old value of the events counter. */ static unsigned int saved_count; static DEFINE_RAW_SPINLOCK(events_lock); static void pm_wakeup_timer_fn(struct timer_list *t); static LIST_HEAD(wakeup_sources); static DECLARE_WAIT_QUEUE_HEAD(wakeup_count_wait_queue); DEFINE_STATIC_SRCU(wakeup_srcu); static struct wakeup_source deleted_ws = { .name = "deleted", .lock = __SPIN_LOCK_UNLOCKED(deleted_ws.lock), }; static DEFINE_IDA(wakeup_ida); /** * wakeup_source_create - Create a struct wakeup_source object. * @name: Name of the new wakeup source. */ struct wakeup_source *wakeup_source_create(const char *name) { struct wakeup_source *ws; const char *ws_name; int id; ws = kzalloc(sizeof(*ws), GFP_KERNEL); if (!ws) goto err_ws; ws_name = kstrdup_const(name, GFP_KERNEL); if (!ws_name) goto err_name; ws->name = ws_name; id = ida_alloc(&wakeup_ida, GFP_KERNEL); if (id < 0) goto err_id; ws->id = id; return ws; err_id: kfree_const(ws->name); err_name: kfree(ws); err_ws: return NULL; } EXPORT_SYMBOL_GPL(wakeup_source_create); /* * Record wakeup_source statistics being deleted into a dummy wakeup_source. */ static void wakeup_source_record(struct wakeup_source *ws) { unsigned long flags; spin_lock_irqsave(&deleted_ws.lock, flags); if (ws->event_count) { deleted_ws.total_time = ktime_add(deleted_ws.total_time, ws->total_time); deleted_ws.prevent_sleep_time = ktime_add(deleted_ws.prevent_sleep_time, ws->prevent_sleep_time); deleted_ws.max_time = ktime_compare(deleted_ws.max_time, ws->max_time) > 0 ? deleted_ws.max_time : ws->max_time; deleted_ws.event_count += ws->event_count; deleted_ws.active_count += ws->active_count; deleted_ws.relax_count += ws->relax_count; deleted_ws.expire_count += ws->expire_count; deleted_ws.wakeup_count += ws->wakeup_count; } spin_unlock_irqrestore(&deleted_ws.lock, flags); } static void wakeup_source_free(struct wakeup_source *ws) { ida_free(&wakeup_ida, ws->id); kfree_const(ws->name); kfree(ws); } /** * wakeup_source_destroy - Destroy a struct wakeup_source object. * @ws: Wakeup source to destroy. * * Use only for wakeup source objects created with wakeup_source_create(). */ void wakeup_source_destroy(struct wakeup_source *ws) { if (!ws) return; __pm_relax(ws); wakeup_source_record(ws); wakeup_source_free(ws); } EXPORT_SYMBOL_GPL(wakeup_source_destroy); /** * wakeup_source_add - Add given object to the list of wakeup sources. * @ws: Wakeup source object to add to the list. */ void wakeup_source_add(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; spin_lock_init(&ws->lock); timer_setup(&ws->timer, pm_wakeup_timer_fn, 0); ws->active = false; raw_spin_lock_irqsave(&events_lock, flags); list_add_rcu(&ws->entry, &wakeup_sources); raw_spin_unlock_irqrestore(&events_lock, flags); } EXPORT_SYMBOL_GPL(wakeup_source_add); /** * wakeup_source_remove - Remove given object from the wakeup sources list. * @ws: Wakeup source object to remove from the list. */ void wakeup_source_remove(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; raw_spin_lock_irqsave(&events_lock, flags); list_del_rcu(&ws->entry); raw_spin_unlock_irqrestore(&events_lock, flags); synchronize_srcu(&wakeup_srcu); del_timer_sync(&ws->timer); /* * Clear timer.function to make wakeup_source_not_registered() treat * this wakeup source as not registered. */ ws->timer.function = NULL; } EXPORT_SYMBOL_GPL(wakeup_source_remove); /** * wakeup_source_register - Create wakeup source and add it to the list. * @dev: Device this wakeup source is associated with (or NULL if virtual). * @name: Name of the wakeup source to register. */ struct wakeup_source *wakeup_source_register(struct device *dev, const char *name) { struct wakeup_source *ws; int ret; ws = wakeup_source_create(name); if (ws) { if (!dev || device_is_registered(dev)) { ret = wakeup_source_sysfs_add(dev, ws); if (ret) { wakeup_source_free(ws); return NULL; } } wakeup_source_add(ws); } return ws; } EXPORT_SYMBOL_GPL(wakeup_source_register); /** * wakeup_source_unregister - Remove wakeup source from the list and remove it. * @ws: Wakeup source object to unregister. */ void wakeup_source_unregister(struct wakeup_source *ws) { if (ws) { wakeup_source_remove(ws); if (ws->dev) wakeup_source_sysfs_remove(ws); wakeup_source_destroy(ws); } } EXPORT_SYMBOL_GPL(wakeup_source_unregister); /** * wakeup_sources_read_lock - Lock wakeup source list for read. * * Returns an index of srcu lock for struct wakeup_srcu. * This index must be passed to the matching wakeup_sources_read_unlock(). */ int wakeup_sources_read_lock(void) { return srcu_read_lock(&wakeup_srcu); } EXPORT_SYMBOL_GPL(wakeup_sources_read_lock); /** * wakeup_sources_read_unlock - Unlock wakeup source list. * @idx: return value from corresponding wakeup_sources_read_lock() */ void wakeup_sources_read_unlock(int idx) { srcu_read_unlock(&wakeup_srcu, idx); } EXPORT_SYMBOL_GPL(wakeup_sources_read_unlock); /** * wakeup_sources_walk_start - Begin a walk on wakeup source list * * Returns first object of the list of wakeup sources. * * Note that to be safe, wakeup sources list needs to be locked by calling * wakeup_source_read_lock() for this. */ struct wakeup_source *wakeup_sources_walk_start(void) { struct list_head *ws_head = &wakeup_sources; return list_entry_rcu(ws_head->next, struct wakeup_source, entry); } EXPORT_SYMBOL_GPL(wakeup_sources_walk_start); /** * wakeup_sources_walk_next - Get next wakeup source from the list * @ws: Previous wakeup source object * * Note that to be safe, wakeup sources list needs to be locked by calling * wakeup_source_read_lock() for this. */ struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws) { struct list_head *ws_head = &wakeup_sources; return list_next_or_null_rcu(ws_head, &ws->entry, struct wakeup_source, entry); } EXPORT_SYMBOL_GPL(wakeup_sources_walk_next); /** * device_wakeup_attach - Attach a wakeup source object to a device object. * @dev: Device to handle. * @ws: Wakeup source object to attach to @dev. * * This causes @dev to be treated as a wakeup device. */ static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) { spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { spin_unlock_irq(&dev->power.lock); return -EEXIST; } dev->power.wakeup = ws; if (dev->power.wakeirq) device_wakeup_attach_irq(dev, dev->power.wakeirq); spin_unlock_irq(&dev->power.lock); return 0; } /** * device_wakeup_enable - Enable given device to be a wakeup source. * @dev: Device to handle. * * Create a wakeup source object, register it and attach it to @dev. */ int device_wakeup_enable(struct device *dev) { struct wakeup_source *ws; int ret; if (!dev || !dev->power.can_wakeup) return -EINVAL; if (pm_suspend_target_state != PM_SUSPEND_ON) dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__); ws = wakeup_source_register(dev, dev_name(dev)); if (!ws) return -ENOMEM; ret = device_wakeup_attach(dev, ws); if (ret) wakeup_source_unregister(ws); return ret; } EXPORT_SYMBOL_GPL(device_wakeup_enable); /** * device_wakeup_attach_irq - Attach a wakeirq to a wakeup source * @dev: Device to handle * @wakeirq: Device specific wakeirq entry * * Attach a device wakeirq to the wakeup source so the device * wake IRQ can be configured automatically for suspend and * resume. * * Call under the device's power.lock lock. */ void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) { struct wakeup_source *ws; ws = dev->power.wakeup; if (!ws) return; if (ws->wakeirq) dev_err(dev, "Leftover wakeup IRQ found, overriding\n"); ws->wakeirq = wakeirq; } /** * device_wakeup_detach_irq - Detach a wakeirq from a wakeup source * @dev: Device to handle * * Removes a device wakeirq from the wakeup source. * * Call under the device's power.lock lock. */ void device_wakeup_detach_irq(struct device *dev) { struct wakeup_source *ws; ws = dev->power.wakeup; if (ws) ws->wakeirq = NULL; } /** * device_wakeup_arm_wake_irqs - * * Iterates over the list of device wakeirqs to arm them. */ void device_wakeup_arm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) dev_pm_arm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_disarm_wake_irqs - * * Iterates over the list of device wakeirqs to disarm them. */ void device_wakeup_disarm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) dev_pm_disarm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_detach - Detach a device's wakeup source object from it. * @dev: Device to detach the wakeup source object from. * * After it returns, @dev will not be treated as a wakeup device any more. */ static struct wakeup_source *device_wakeup_detach(struct device *dev) { struct wakeup_source *ws; spin_lock_irq(&dev->power.lock); ws = dev->power.wakeup; dev->power.wakeup = NULL; spin_unlock_irq(&dev->power.lock); return ws; } /** * device_wakeup_disable - Do not regard a device as a wakeup source any more. * @dev: Device to handle. * * Detach the @dev's wakeup source object from it, unregister this wakeup source * object and destroy it. */ void device_wakeup_disable(struct device *dev) { struct wakeup_source *ws; if (!dev || !dev->power.can_wakeup) return; ws = device_wakeup_detach(dev); wakeup_source_unregister(ws); } EXPORT_SYMBOL_GPL(device_wakeup_disable); /** * device_set_wakeup_capable - Set/reset device wakeup capability flag. * @dev: Device to handle. * @capable: Whether or not @dev is capable of waking up the system from sleep. * * If @capable is set, set the @dev's power.can_wakeup flag and add its * wakeup-related attributes to sysfs. Otherwise, unset the @dev's * power.can_wakeup flag and remove its wakeup-related attributes from sysfs. * * This function may sleep and it can't be called from any context where * sleeping is not allowed. */ void device_set_wakeup_capable(struct device *dev, bool capable) { if (!!dev->power.can_wakeup == !!capable) return; dev->power.can_wakeup = capable; if (device_is_registered(dev) && !list_empty(&dev->power.entry)) { if (capable) { int ret = wakeup_sysfs_add(dev); if (ret) dev_info(dev, "Wakeup sysfs attributes not added\n"); } else { wakeup_sysfs_remove(dev); } } } EXPORT_SYMBOL_GPL(device_set_wakeup_capable); /** * device_set_wakeup_enable - Enable or disable a device to wake up the system. * @dev: Device to handle. * @enable: enable/disable flag */ int device_set_wakeup_enable(struct device *dev, bool enable) { if (enable) return device_wakeup_enable(dev); device_wakeup_disable(dev); return 0; } EXPORT_SYMBOL_GPL(device_set_wakeup_enable); /** * wakeup_source_not_registered - validate the given wakeup source. * @ws: Wakeup source to be validated. */ static bool wakeup_source_not_registered(struct wakeup_source *ws) { /* * Use timer struct to check if the given source is initialized * by wakeup_source_add. */ return ws->timer.function != pm_wakeup_timer_fn; } /* * The functions below use the observation that each wakeup event starts a * period in which the system should not be suspended. The moment this period * will end depends on how the wakeup event is going to be processed after being * detected and all of the possible cases can be divided into two distinct * groups. * * First, a wakeup event may be detected by the same functional unit that will * carry out the entire processing of it and possibly will pass it to user space * for further processing. In that case the functional unit that has detected * the event may later "close" the "no suspend" period associated with it * directly as soon as it has been dealt with. The pair of pm_stay_awake() and * pm_relax(), balanced with each other, is supposed to be used in such * situations. * * Second, a wakeup event may be detected by one functional unit and processed * by another one. In that case the unit that has detected it cannot really * "close" the "no suspend" period associated with it, unless it knows in * advance what's going to happen to the event during processing. This * knowledge, however, may not be available to it, so it can simply specify time * to wait before the system can be suspended and pass it as the second * argument of pm_wakeup_event(). * * It is valid to call pm_relax() after pm_wakeup_event(), in which case the * "no suspend" period will be ended either by the pm_relax(), or by the timer * function executed when the timer expires, whichever comes first. */ /** * wakeup_source_activate - Mark given wakeup source as active. * @ws: Wakeup source to handle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM * core of the event by incrementing the counter of the wakeup events being * processed. */ static void wakeup_source_activate(struct wakeup_source *ws) { unsigned int cec; if (WARN_ONCE(wakeup_source_not_registered(ws), "unregistered wakeup source\n")) return; ws->active = true; ws->active_count++; ws->last_time = ktime_get(); if (ws->autosleep_enabled) ws->start_prevent_time = ws->last_time; /* Increment the counter of events in progress. */ cec = atomic_inc_return(&combined_event_count); trace_wakeup_source_activate(ws->name, cec); } /** * wakeup_source_report_event - Report wakeup event using the given source. * @ws: Wakeup source to report the event for. * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. */ static void wakeup_source_report_event(struct wakeup_source *ws, bool hard) { ws->event_count++; /* This is racy, but the counter is approximate anyway. */ if (events_check_enabled) ws->wakeup_count++; if (!ws->active) wakeup_source_activate(ws); if (hard) pm_system_wakeup(); } /** * __pm_stay_awake - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the source of the event. * * It is safe to call this function from interrupt context. */ void __pm_stay_awake(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, false); del_timer(&ws->timer); ws->timer_expires = 0; spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_stay_awake); /** * pm_stay_awake - Notify the PM core that a wakeup event is being processed. * @dev: Device the wakeup event is related to. * * Notify the PM core of a wakeup event (signaled by @dev) by calling * __pm_stay_awake for the @dev's wakeup source object. * * Call this function after detecting of a wakeup event if pm_relax() is going * to be called directly after processing the event (and possibly passing it to * user space for further processing). */ void pm_stay_awake(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_stay_awake(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_stay_awake); #ifdef CONFIG_PM_AUTOSLEEP static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) { ktime_t delta = ktime_sub(now, ws->start_prevent_time); ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time, delta); } #else static inline void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) {} #endif /** * wakeup_source_deactivate - Mark given wakeup source as inactive. * @ws: Wakeup source to handle. * * Update the @ws' statistics and notify the PM core that the wakeup source has * become inactive by decrementing the counter of wakeup events being processed * and incrementing the counter of registered wakeup events. */ static void wakeup_source_deactivate(struct wakeup_source *ws) { unsigned int cnt, inpr, cec; ktime_t duration; ktime_t now; ws->relax_count++; /* * __pm_relax() may be called directly or from a timer function. * If it is called directly right after the timer function has been * started, but before the timer function calls __pm_relax(), it is * possible that __pm_stay_awake() will be called in the meantime and * will set ws->active. Then, ws->active may be cleared immediately * by the __pm_relax() called from the timer function, but in such a * case ws->relax_count will be different from ws->active_count. */ if (ws->relax_count != ws->active_count) { ws->relax_count--; return; } ws->active = false; now = ktime_get(); duration = ktime_sub(now, ws->last_time); ws->total_time = ktime_add(ws->total_time, duration); if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time)) ws->max_time = duration; ws->last_time = now; del_timer(&ws->timer); ws->timer_expires = 0; if (ws->autosleep_enabled) update_prevent_sleep_time(ws, now); /* * Increment the counter of registered wakeup events and decrement the * counter of wakeup events in progress simultaneously. */ cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); trace_wakeup_source_deactivate(ws->name, cec); split_counters(&cnt, &inpr); if (!inpr && waitqueue_active(&wakeup_count_wait_queue)) wake_up(&wakeup_count_wait_queue); } /** * __pm_relax - Notify the PM core that processing of a wakeup event has ended. * @ws: Wakeup source object associated with the source of the event. * * Call this function for wakeup events whose processing started with calling * __pm_stay_awake(). * * It is safe to call it from interrupt context. */ void __pm_relax(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); if (ws->active) wakeup_source_deactivate(ws); spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_relax); /** * pm_relax - Notify the PM core that processing of a wakeup event has ended. * @dev: Device that signaled the event. * * Execute __pm_relax() for the @dev's wakeup source object. */ void pm_relax(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_relax(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_relax); /** * pm_wakeup_timer_fn - Delayed finalization of a wakeup event. * @t: timer list * * Call wakeup_source_deactivate() for the wakeup source whose address is stored * in @data if it is currently active and its timer has not been canceled and * the expiration time of the timer is not in future. */ static void pm_wakeup_timer_fn(struct timer_list *t) { struct wakeup_source *ws = from_timer(ws, t, timer); unsigned long flags; spin_lock_irqsave(&ws->lock, flags); if (ws->active && ws->timer_expires && time_after_eq(jiffies, ws->timer_expires)) { wakeup_source_deactivate(ws); ws->expire_count++; } spin_unlock_irqrestore(&ws->lock, flags); } /** * pm_wakeup_ws_event - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the event source. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Notify the PM core of a wakeup event whose source is @ws that will take * approximately @msec milliseconds to be processed by the kernel. If @ws is * not active, activate it. If @msec is nonzero, set up the @ws' timer to * execute pm_wakeup_timer_fn() in future. * * It is safe to call this function from interrupt context. */ void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) { unsigned long flags; unsigned long expires; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, hard); if (!msec) { wakeup_source_deactivate(ws); goto unlock; } expires = jiffies + msecs_to_jiffies(msec); if (!expires) expires = 1; if (!ws->timer_expires || time_after(expires, ws->timer_expires)) { mod_timer(&ws->timer, expires); ws->timer_expires = expires; } unlock: spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_ws_event); /** * pm_wakeup_dev_event - Notify the PM core of a wakeup event. * @dev: Device the wakeup event is related to. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Call pm_wakeup_ws_event() for the @dev's wakeup source object. */ void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); pm_wakeup_ws_event(dev->power.wakeup, msec, hard); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_dev_event); void pm_print_active_wakeup_sources(void) { struct wakeup_source *ws; int srcuidx, active = 0; struct wakeup_source *last_activity_ws = NULL; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { if (ws->active) { pm_pr_dbg("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || ktime_to_ns(ws->last_time) > ktime_to_ns(last_activity_ws->last_time))) { last_activity_ws = ws; } } if (!active && last_activity_ws) pm_pr_dbg("last active wakeup source: %s\n", last_activity_ws->name); srcu_read_unlock(&wakeup_srcu, srcuidx); } EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources); /** * pm_wakeup_pending - Check if power transition in progress should be aborted. * * Compare the current number of registered wakeup events with its preserved * value from the past and return true if new wakeup events have been registered * since the old value was stored. Also return true if the current number of * wakeup events being processed is different from zero. */ bool pm_wakeup_pending(void) { unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { unsigned int cnt, inpr; split_counters(&cnt, &inpr); ret = (cnt != saved_count || inpr > 0); events_check_enabled = !ret; } raw_spin_unlock_irqrestore(&events_lock, flags); if (ret) { pm_pr_dbg("Wakeup pending, aborting suspend\n"); pm_print_active_wakeup_sources(); } return ret || atomic_read(&pm_abort_suspend) > 0; } EXPORT_SYMBOL_GPL(pm_wakeup_pending); void pm_system_wakeup(void) { atomic_inc(&pm_abort_suspend); s2idle_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); void pm_system_cancel_wakeup(void) { atomic_dec_if_positive(&pm_abort_suspend); } void pm_wakeup_clear(unsigned int irq_number) { raw_spin_lock_irq(&wakeup_irq_lock); if (irq_number && wakeup_irq[0] == irq_number) wakeup_irq[0] = wakeup_irq[1]; else wakeup_irq[0] = 0; wakeup_irq[1] = 0; raw_spin_unlock_irq(&wakeup_irq_lock); if (!irq_number) atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) { unsigned long flags; raw_spin_lock_irqsave(&wakeup_irq_lock, flags); if (wakeup_irq[0] == 0) wakeup_irq[0] = irq_number; else if (wakeup_irq[1] == 0) wakeup_irq[1] = irq_number; else irq_number = 0; pm_pr_dbg("Triggering wakeup from IRQ %d\n", irq_number); raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags); if (irq_number) pm_system_wakeup(); } unsigned int pm_wakeup_irq(void) { return wakeup_irq[0]; } /** * pm_get_wakeup_count - Read the number of registered wakeup events. * @count: Address to store the value at. * @block: Whether or not to block. * * Store the number of registered wakeup events at the address in @count. If * @block is set, block until the current number of wakeup events being * processed is zero. * * Return 'false' if the current number of wakeup events being processed is * nonzero. Otherwise return 'true'. */ bool pm_get_wakeup_count(unsigned int *count, bool block) { unsigned int cnt, inpr; if (block) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(&wakeup_count_wait_queue, &wait, TASK_INTERRUPTIBLE); split_counters(&cnt, &inpr); if (inpr == 0 || signal_pending(current)) break; pm_print_active_wakeup_sources(); schedule(); } finish_wait(&wakeup_count_wait_queue, &wait); } split_counters(&cnt, &inpr); *count = cnt; return !inpr; } /** * pm_save_wakeup_count - Save the current number of registered wakeup events. * @count: Value to compare with the current number of registered wakeup events. * * If @count is equal to the current number of registered wakeup events and the * current number of wakeup events being processed is zero, store @count as the * old number of registered wakeup events for pm_check_wakeup_events(), enable * wakeup events detection and return 'true'. Otherwise disable wakeup events * detection and return 'false'. */ bool pm_save_wakeup_count(unsigned int count) { unsigned int cnt, inpr; unsigned long flags; events_check_enabled = false; raw_spin_lock_irqsave(&events_lock, flags); split_counters(&cnt, &inpr); if (cnt == count && inpr == 0) { saved_count = count; events_check_enabled = true; } raw_spin_unlock_irqrestore(&events_lock, flags); return events_check_enabled; } #ifdef CONFIG_PM_AUTOSLEEP /** * pm_wakep_autosleep_enabled - Modify autosleep_enabled for all wakeup sources. * @set: Whether to set or to clear the autosleep_enabled flags. */ void pm_wakep_autosleep_enabled(bool set) { struct wakeup_source *ws; ktime_t now = ktime_get(); int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { spin_lock_irq(&ws->lock); if (ws->autosleep_enabled != set) { ws->autosleep_enabled = set; if (ws->active) { if (set) ws->start_prevent_time = now; else update_prevent_sleep_time(ws, now); } } spin_unlock_irq(&ws->lock); } srcu_read_unlock(&wakeup_srcu, srcuidx); } #endif /* CONFIG_PM_AUTOSLEEP */ /** * print_wakeup_source_stats - Print wakeup source statistics information. * @m: seq_file to print the statistics into. * @ws: Wakeup source object to print the statistics for. */ static int print_wakeup_source_stats(struct seq_file *m, struct wakeup_source *ws) { unsigned long flags; ktime_t total_time; ktime_t max_time; unsigned long active_count; ktime_t active_time; ktime_t prevent_sleep_time; spin_lock_irqsave(&ws->lock, flags); total_time = ws->total_time; max_time = ws->max_time; prevent_sleep_time = ws->prevent_sleep_time; active_count = ws->active_count; if (ws->active) { ktime_t now = ktime_get(); active_time = ktime_sub(now, ws->last_time); total_time = ktime_add(total_time, active_time); if (active_time > max_time) max_time = active_time; if (ws->autosleep_enabled) prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(now, ws->start_prevent_time)); } else { active_time = 0; } seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n", ws->name, active_count, ws->event_count, ws->wakeup_count, ws->expire_count, ktime_to_ms(active_time), ktime_to_ms(total_time), ktime_to_ms(max_time), ktime_to_ms(ws->last_time), ktime_to_ms(prevent_sleep_time)); spin_unlock_irqrestore(&ws->lock, flags); return 0; } static void *wakeup_sources_stats_seq_start(struct seq_file *m, loff_t *pos) { struct wakeup_source *ws; loff_t n = *pos; int *srcuidx = m->private; if (n == 0) { seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t" "expire_count\tactive_since\ttotal_time\tmax_time\t" "last_change\tprevent_suspend_time\n"); } *srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { if (n-- <= 0) return ws; } return NULL; } static void *wakeup_sources_stats_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct wakeup_source *ws = v; struct wakeup_source *next_ws = NULL; ++(*pos); list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) { next_ws = ws; break; } if (!next_ws) print_wakeup_source_stats(m, &deleted_ws); return next_ws; } static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v) { int *srcuidx = m->private; srcu_read_unlock(&wakeup_srcu, *srcuidx); } /** * wakeup_sources_stats_seq_show - Print wakeup sources statistics information. * @m: seq_file to print the statistics into. * @v: wakeup_source of each iteration */ static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v) { struct wakeup_source *ws = v; print_wakeup_source_stats(m, ws); return 0; } static const struct seq_operations wakeup_sources_stats_seq_ops = { .start = wakeup_sources_stats_seq_start, .next = wakeup_sources_stats_seq_next, .stop = wakeup_sources_stats_seq_stop, .show = wakeup_sources_stats_seq_show, }; static int wakeup_sources_stats_open(struct inode *inode, struct file *file) { return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int)); } static const struct file_operations wakeup_sources_stats_fops = { .owner = THIS_MODULE, .open = wakeup_sources_stats_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; static int __init wakeup_sources_debugfs_init(void) { debugfs_create_file("wakeup_sources", 0444, NULL, NULL, &wakeup_sources_stats_fops); return 0; } postcore_initcall(wakeup_sources_debugfs_init);
5 5 5 8 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <net/ip.h> #include "ipvlan.h" static unsigned int ipvlan_netid __read_mostly; struct ipvlan_netns { unsigned int ipvl_nf_hook_refcnt; }; static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, struct net_device *dev) { struct ipvl_addr *addr = NULL; struct ipvl_port *port; int addr_type; void *lyr3h; if (!dev || !netif_is_ipvlan_port(dev)) goto out; port = ipvlan_port_get_rcu(dev); if (!port || port->mode != IPVLAN_MODE_L3S) goto out; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); out: return addr; } static struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, u16 proto) { struct ipvl_addr *addr; struct net_device *sdev; addr = ipvlan_skb_to_addr(skb, dev); if (!addr) goto out; sdev = addr->master->dev; switch (proto) { case AF_INET: { const struct iphdr *ip4h = ip_hdr(skb); int err; err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, ip4h_dscp(ip4h), sdev); if (unlikely(err)) goto out; break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct dst_entry *dst; struct ipv6hdr *ip6h = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct flowi6 fl6 = { .flowi6_iif = sdev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; skb_dst_drop(skb); dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, skb, flags); skb_dst_set(skb, dst); break; } #endif default: break; } out: return skb; } static const struct l3mdev_ops ipvl_l3mdev_ops = { .l3mdev_l3_rcv = ipvlan_l3_rcv, }; static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct ipvl_addr *addr; unsigned int len; addr = ipvlan_skb_to_addr(skb, skb->dev); if (!addr) goto out; skb->dev = addr->master->dev; skb->skb_iif = skb->dev->ifindex; #if IS_ENABLED(CONFIG_IPV6) if (addr->atype == IPVL_IPV6) IP6CB(skb)->iif = skb->dev->ifindex; #endif len = skb->len + ETH_HLEN; ipvlan_count_rx(addr->master, len, true, false); out: return NF_ACCEPT; } static const struct nf_hook_ops ipvl_nfops[] = { { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #endif }; static int ipvlan_register_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); int err = 0; if (!vnet->ipvl_nf_hook_refcnt) { err = nf_register_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); if (!err) vnet->ipvl_nf_hook_refcnt = 1; } else { vnet->ipvl_nf_hook_refcnt++; } return err; } static void ipvlan_unregister_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON(!vnet->ipvl_nf_hook_refcnt)) return; vnet->ipvl_nf_hook_refcnt--; if (!vnet->ipvl_nf_hook_refcnt) nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { struct ipvlan_netns *old_vnet; ASSERT_RTNL(); old_vnet = net_generic(oldnet, ipvlan_netid); if (!old_vnet->ipvl_nf_hook_refcnt) return; ipvlan_register_nf_hook(newnet); ipvlan_unregister_nf_hook(oldnet); } static void ipvlan_ns_exit(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON_ONCE(vnet->ipvl_nf_hook_refcnt)) { vnet->ipvl_nf_hook_refcnt = 0; nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } } static struct pernet_operations ipvlan_net_ops = { .id = &ipvlan_netid, .size = sizeof(struct ipvlan_netns), .exit = ipvlan_ns_exit, }; int ipvlan_l3s_init(void) { return register_pernet_subsys(&ipvlan_net_ops); } void ipvlan_l3s_cleanup(void) { unregister_pernet_subsys(&ipvlan_net_ops); } int ipvlan_l3s_register(struct ipvl_port *port) { struct net_device *dev = port->dev; int ret; ASSERT_RTNL(); ret = ipvlan_register_nf_hook(read_pnet(&port->pnet)); if (!ret) { dev->l3mdev_ops = &ipvl_l3mdev_ops; dev->priv_flags |= IFF_L3MDEV_RX_HANDLER; } return ret; } void ipvlan_l3s_unregister(struct ipvl_port *port) { struct net_device *dev = port->dev; ASSERT_RTNL(); dev->priv_flags &= ~IFF_L3MDEV_RX_HANDLER; ipvlan_unregister_nf_hook(read_pnet(&port->pnet)); dev->l3mdev_ops = NULL; }
8 2 1 2 6 7 7 7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* * net/tipc/addr.c: TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "addr.h" #include "core.h" bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) return true; if (!legacy_format) return false; if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ return true; return false; } void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); tn->trial_addr = hash128to32(id); pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } void tipc_set_node_addr(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); u8 node_id[NODE_ID_LEN] = {0,}; tn->node_addr = addr; if (!tipc_own_id(net)) { sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } tn->trial_addr = addr; tn->addr_trial_end = jiffies; pr_info("Node number set to %u\n", addr); } char *tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; /* Already a string ? */ for (i = 0; i < NODE_ID_LEN; i++) { c = id[i]; if (c >= '0' && c <= '9') continue; if (c >= 'A' && c <= 'Z') continue; if (c >= 'a' && c <= 'z') continue; if (c == '.') continue; if (c == ':') continue; if (c == '_') continue; if (c == '-') continue; if (c == '@') continue; if (c != 0) break; } if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; return str; } /* Translate to hex string */ for (i = 0; i < NODE_ID_LEN; i++) sprintf(&str[2 * i], "%02x", id[i]); /* Strip off trailing zeroes */ for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; return str; }
74 64 57 58 63 64 64 63 63 64 64 64 1 1 1 1 63 63 62 64 63 63 64 63 57 58 64 57 58 64 60 60 60 56 56 1 1 1 1 1 1 1 1 1 62 57 57 57 57 57 57 57 62 64 64 63 5 5 5 56 56 55 56 56 56 2 2 2 56 56 56 2 2 2 58 57 57 56 65 66 58 57 57 67 68 69 68 65 5 2 69 69 65 10 10 10 10 10 10 10 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 4 4 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3 3 2 2 1 1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 /* * drm_irq.c IRQ and vblank support * * \author Rickard E. (Rik) Faith <faith@valinux.com> * \author Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/export.h> #include <linux/kthread.h> #include <linux/moduleparam.h> #include <drm/drm_crtc.h> #include <drm/drm_drv.h> #include <drm/drm_framebuffer.h> #include <drm/drm_managed.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_print.h> #include <drm/drm_vblank.h> #include "drm_internal.h" #include "drm_trace.h" /** * DOC: vblank handling * * From the computer's perspective, every time the monitor displays * a new frame the scanout engine has "scanned out" the display image * from top to bottom, one row of pixels at a time. The current row * of pixels is referred to as the current scanline. * * In addition to the display's visible area, there's usually a couple of * extra scanlines which aren't actually displayed on the screen. * These extra scanlines don't contain image data and are occasionally used * for features like audio and infoframes. The region made up of these * scanlines is referred to as the vertical blanking region, or vblank for * short. * * For historical reference, the vertical blanking period was designed to * give the electron gun (on CRTs) enough time to move back to the top of * the screen to start scanning out the next frame. Similar for horizontal * blanking periods. They were designed to give the electron gun enough * time to move back to the other side of the screen to start scanning the * next scanline. * * :: * * * physical → ⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽ * top of | | * display | | * | New frame | * | | * |↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓| * |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| ← Scanline, * |↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓| updates the * | | frame as it * | | travels down * | | ("scan out") * | Old frame | * | | * | | * | | * | | physical * | | bottom of * vertical |⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽| ← display * blanking ┆xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx┆ * region → ┆xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx┆ * ┆xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx┆ * start of → ⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽ * new frame * * "Physical top of display" is the reference point for the high-precision/ * corrected timestamp. * * On a lot of display hardware, programming needs to take effect during the * vertical blanking period so that settings like gamma, the image buffer * buffer to be scanned out, etc. can safely be changed without showing * any visual artifacts on the screen. In some unforgiving hardware, some of * this programming has to both start and end in the same vblank. To help * with the timing of the hardware programming, an interrupt is usually * available to notify the driver when it can start the updating of registers. * The interrupt is in this context named the vblank interrupt. * * The vblank interrupt may be fired at different points depending on the * hardware. Some hardware implementations will fire the interrupt when the * new frame start, other implementations will fire the interrupt at different * points in time. * * Vertical blanking plays a major role in graphics rendering. To achieve * tear-free display, users must synchronize page flips and/or rendering to * vertical blanking. The DRM API offers ioctls to perform page flips * synchronized to vertical blanking and wait for vertical blanking. * * The DRM core handles most of the vertical blanking management logic, which * involves filtering out spurious interrupts, keeping race-free blanking * counters, coping with counter wrap-around and resets and keeping use counts. * It relies on the driver to generate vertical blanking interrupts and * optionally provide a hardware vertical blanking counter. * * Drivers must initialize the vertical blanking handling core with a call to * drm_vblank_init(). Minimally, a driver needs to implement * &drm_crtc_funcs.enable_vblank and &drm_crtc_funcs.disable_vblank plus call * drm_crtc_handle_vblank() in its vblank interrupt handler for working vblank * support. * * Vertical blanking interrupts can be enabled by the DRM core or by drivers * themselves (for instance to handle page flipping operations). The DRM core * maintains a vertical blanking use count to ensure that the interrupts are not * disabled while a user still needs them. To increment the use count, drivers * call drm_crtc_vblank_get() and release the vblank reference again with * drm_crtc_vblank_put(). In between these two calls vblank interrupts are * guaranteed to be enabled. * * On many hardware disabling the vblank interrupt cannot be done in a race-free * manner, see &drm_vblank_crtc_config.disable_immediate and * &drm_driver.max_vblank_count. In that case the vblank core only disables the * vblanks after a timer has expired, which can be configured through the * ``vblankoffdelay`` module parameter. * * Drivers for hardware without support for vertical-blanking interrupts * must not call drm_vblank_init(). For such drivers, atomic helpers will * automatically generate fake vblank events as part of the display update. * This functionality also can be controlled by the driver by enabling and * disabling struct drm_crtc_state.no_vblank. */ /* Retry timestamp calculation up to 3 times to satisfy * drm_timestamp_precision before giving up. */ #define DRM_TIMESTAMP_MAXRETRIES 3 /* Threshold in nanoseconds for detection of redundant * vblank irq in drm_handle_vblank(). 1 msec should be ok. */ #define DRM_REDUNDANT_VBLIRQ_THRESH_NS 1000000 static bool drm_get_last_vbltimestamp(struct drm_device *dev, unsigned int pipe, ktime_t *tvblank, bool in_vblank_irq); static unsigned int drm_timestamp_precision = 20; /* Default to 20 usecs. */ static int drm_vblank_offdelay = 5000; /* Default to 5000 msecs. */ module_param_named(vblankoffdelay, drm_vblank_offdelay, int, 0600); module_param_named(timestamp_precision_usec, drm_timestamp_precision, int, 0600); MODULE_PARM_DESC(vblankoffdelay, "Delay until vblank irq auto-disable [msecs] (0: never disable, <0: disable immediately)"); MODULE_PARM_DESC(timestamp_precision_usec, "Max. error on timestamps [usecs]"); static struct drm_vblank_crtc * drm_vblank_crtc(struct drm_device *dev, unsigned int pipe) { return &dev->vblank[pipe]; } struct drm_vblank_crtc * drm_crtc_vblank_crtc(struct drm_crtc *crtc) { return drm_vblank_crtc(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_crtc); static void store_vblank(struct drm_device *dev, unsigned int pipe, u32 vblank_count_inc, ktime_t t_vblank, u32 last) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); assert_spin_locked(&dev->vblank_time_lock); vblank->last = last; write_seqlock(&vblank->seqlock); vblank->time = t_vblank; atomic64_add(vblank_count_inc, &vblank->count); write_sequnlock(&vblank->seqlock); } static u32 drm_max_vblank_count(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); return vblank->max_vblank_count ?: dev->max_vblank_count; } /* * "No hw counter" fallback implementation of .get_vblank_counter() hook, * if there is no usable hardware frame counter available. */ static u32 drm_vblank_no_hw_counter(struct drm_device *dev, unsigned int pipe) { drm_WARN_ON_ONCE(dev, drm_max_vblank_count(dev, pipe) != 0); return 0; } static u32 __get_vblank_counter(struct drm_device *dev, unsigned int pipe) { if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (drm_WARN_ON(dev, !crtc)) return 0; if (crtc->funcs->get_vblank_counter) return crtc->funcs->get_vblank_counter(crtc); } return drm_vblank_no_hw_counter(dev, pipe); } /* * Reset the stored timestamp for the current vblank count to correspond * to the last vblank occurred. * * Only to be called from drm_crtc_vblank_on(). * * Note: caller must hold &drm_device.vbl_lock since this reads & writes * device vblank fields. */ static void drm_reset_vblank_timestamp(struct drm_device *dev, unsigned int pipe) { u32 cur_vblank; bool rc; ktime_t t_vblank; int count = DRM_TIMESTAMP_MAXRETRIES; spin_lock(&dev->vblank_time_lock); /* * sample the current counter to avoid random jumps * when drm_vblank_enable() applies the diff */ do { cur_vblank = __get_vblank_counter(dev, pipe); rc = drm_get_last_vbltimestamp(dev, pipe, &t_vblank, false); } while (cur_vblank != __get_vblank_counter(dev, pipe) && --count > 0); /* * Only reinitialize corresponding vblank timestamp if high-precision query * available and didn't fail. Otherwise reinitialize delayed at next vblank * interrupt and assign 0 for now, to mark the vblanktimestamp as invalid. */ if (!rc) t_vblank = 0; /* * +1 to make sure user will never see the same * vblank counter value before and after a modeset */ store_vblank(dev, pipe, 1, t_vblank, cur_vblank); spin_unlock(&dev->vblank_time_lock); } /* * Call back into the driver to update the appropriate vblank counter * (specified by @pipe). Deal with wraparound, if it occurred, and * update the last read value so we can deal with wraparound on the next * call if necessary. * * Only necessary when going from off->on, to account for frames we * didn't get an interrupt for. * * Note: caller must hold &drm_device.vbl_lock since this reads & writes * device vblank fields. */ static void drm_update_vblank_count(struct drm_device *dev, unsigned int pipe, bool in_vblank_irq) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); u32 cur_vblank, diff; bool rc; ktime_t t_vblank; int count = DRM_TIMESTAMP_MAXRETRIES; int framedur_ns = vblank->framedur_ns; u32 max_vblank_count = drm_max_vblank_count(dev, pipe); /* * Interrupts were disabled prior to this call, so deal with counter * wrap if needed. * NOTE! It's possible we lost a full dev->max_vblank_count + 1 events * here if the register is small or we had vblank interrupts off for * a long time. * * We repeat the hardware vblank counter & timestamp query until * we get consistent results. This to prevent races between gpu * updating its hardware counter while we are retrieving the * corresponding vblank timestamp. */ do { cur_vblank = __get_vblank_counter(dev, pipe); rc = drm_get_last_vbltimestamp(dev, pipe, &t_vblank, in_vblank_irq); } while (cur_vblank != __get_vblank_counter(dev, pipe) && --count > 0); if (max_vblank_count) { /* trust the hw counter when it's around */ diff = (cur_vblank - vblank->last) & max_vblank_count; } else if (rc && framedur_ns) { u64 diff_ns = ktime_to_ns(ktime_sub(t_vblank, vblank->time)); /* * Figure out how many vblanks we've missed based * on the difference in the timestamps and the * frame/field duration. */ drm_dbg_vbl(dev, "crtc %u: Calculating number of vblanks." " diff_ns = %lld, framedur_ns = %d)\n", pipe, (long long)diff_ns, framedur_ns); diff = DIV_ROUND_CLOSEST_ULL(diff_ns, framedur_ns); if (diff == 0 && in_vblank_irq) drm_dbg_vbl(dev, "crtc %u: Redundant vblirq ignored\n", pipe); } else { /* some kind of default for drivers w/o accurate vbl timestamping */ diff = in_vblank_irq ? 1 : 0; } /* * Within a drm_vblank_pre_modeset - drm_vblank_post_modeset * interval? If so then vblank irqs keep running and it will likely * happen that the hardware vblank counter is not trustworthy as it * might reset at some point in that interval and vblank timestamps * are not trustworthy either in that interval. Iow. this can result * in a bogus diff >> 1 which must be avoided as it would cause * random large forward jumps of the software vblank counter. */ if (diff > 1 && (vblank->inmodeset & 0x2)) { drm_dbg_vbl(dev, "clamping vblank bump to 1 on crtc %u: diffr=%u" " due to pre-modeset.\n", pipe, diff); diff = 1; } drm_dbg_vbl(dev, "updating vblank count on crtc %u:" " current=%llu, diff=%u, hw=%u hw_last=%u\n", pipe, (unsigned long long)atomic64_read(&vblank->count), diff, cur_vblank, vblank->last); if (diff == 0) { drm_WARN_ON_ONCE(dev, cur_vblank != vblank->last); return; } /* * Only reinitialize corresponding vblank timestamp if high-precision query * available and didn't fail, or we were called from the vblank interrupt. * Otherwise reinitialize delayed at next vblank interrupt and assign 0 * for now, to mark the vblanktimestamp as invalid. */ if (!rc && !in_vblank_irq) t_vblank = 0; store_vblank(dev, pipe, diff, t_vblank, cur_vblank); } u64 drm_vblank_count(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); u64 count; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return 0; count = atomic64_read(&vblank->count); /* * This read barrier corresponds to the implicit write barrier of the * write seqlock in store_vblank(). Note that this is the only place * where we need an explicit barrier, since all other access goes * through drm_vblank_count_and_time(), which already has the required * read barrier curtesy of the read seqlock. */ smp_rmb(); return count; } /** * drm_crtc_accurate_vblank_count - retrieve the master vblank counter * @crtc: which counter to retrieve * * This function is similar to drm_crtc_vblank_count() but this function * interpolates to handle a race with vblank interrupts using the high precision * timestamping support. * * This is mostly useful for hardware that can obtain the scanout position, but * doesn't have a hardware frame counter. */ u64 drm_crtc_accurate_vblank_count(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); u64 vblank; unsigned long flags; drm_WARN_ONCE(dev, drm_debug_enabled(DRM_UT_VBL) && !crtc->funcs->get_vblank_timestamp, "This function requires support for accurate vblank timestamps."); spin_lock_irqsave(&dev->vblank_time_lock, flags); drm_update_vblank_count(dev, pipe, false); vblank = drm_vblank_count(dev, pipe); spin_unlock_irqrestore(&dev->vblank_time_lock, flags); return vblank; } EXPORT_SYMBOL(drm_crtc_accurate_vblank_count); static void __disable_vblank(struct drm_device *dev, unsigned int pipe) { if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (drm_WARN_ON(dev, !crtc)) return; if (crtc->funcs->disable_vblank) crtc->funcs->disable_vblank(crtc); } } /* * Disable vblank irq's on crtc, make sure that last vblank count * of hardware and corresponding consistent software vblank counter * are preserved, even if there are any spurious vblank irq's after * disable. */ void drm_vblank_disable_and_save(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); unsigned long irqflags; assert_spin_locked(&dev->vbl_lock); /* Prevent vblank irq processing while disabling vblank irqs, * so no updates of timestamps or count can happen after we've * disabled. Needed to prevent races in case of delayed irq's. */ spin_lock_irqsave(&dev->vblank_time_lock, irqflags); /* * Update vblank count and disable vblank interrupts only if the * interrupts were enabled. This avoids calling the ->disable_vblank() * operation in atomic context with the hardware potentially runtime * suspended. */ if (!vblank->enabled) goto out; /* * Update the count and timestamp to maintain the * appearance that the counter has been ticking all along until * this time. This makes the count account for the entire time * between drm_crtc_vblank_on() and drm_crtc_vblank_off(). */ drm_update_vblank_count(dev, pipe, false); __disable_vblank(dev, pipe); vblank->enabled = false; out: spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags); } static void vblank_disable_fn(struct timer_list *t) { struct drm_vblank_crtc *vblank = from_timer(vblank, t, disable_timer); struct drm_device *dev = vblank->dev; unsigned int pipe = vblank->pipe; unsigned long irqflags; spin_lock_irqsave(&dev->vbl_lock, irqflags); if (atomic_read(&vblank->refcount) == 0 && vblank->enabled) { drm_dbg_core(dev, "disabling vblank on crtc %u\n", pipe); drm_vblank_disable_and_save(dev, pipe); } spin_unlock_irqrestore(&dev->vbl_lock, irqflags); } static void drm_vblank_init_release(struct drm_device *dev, void *ptr) { struct drm_vblank_crtc *vblank = ptr; drm_WARN_ON(dev, READ_ONCE(vblank->enabled) && drm_core_check_feature(dev, DRIVER_MODESET)); drm_vblank_destroy_worker(vblank); del_timer_sync(&vblank->disable_timer); } /** * drm_vblank_init - initialize vblank support * @dev: DRM device * @num_crtcs: number of CRTCs supported by @dev * * This function initializes vblank support for @num_crtcs display pipelines. * Cleanup is handled automatically through a cleanup function added with * drmm_add_action_or_reset(). * * Returns: * Zero on success or a negative error code on failure. */ int drm_vblank_init(struct drm_device *dev, unsigned int num_crtcs) { int ret; unsigned int i; spin_lock_init(&dev->vbl_lock); spin_lock_init(&dev->vblank_time_lock); dev->vblank = drmm_kcalloc(dev, num_crtcs, sizeof(*dev->vblank), GFP_KERNEL); if (!dev->vblank) return -ENOMEM; dev->num_crtcs = num_crtcs; for (i = 0; i < num_crtcs; i++) { struct drm_vblank_crtc *vblank = &dev->vblank[i]; vblank->dev = dev; vblank->pipe = i; init_waitqueue_head(&vblank->queue); timer_setup(&vblank->disable_timer, vblank_disable_fn, 0); seqlock_init(&vblank->seqlock); ret = drmm_add_action_or_reset(dev, drm_vblank_init_release, vblank); if (ret) return ret; ret = drm_vblank_worker_init(vblank); if (ret) return ret; } return 0; } EXPORT_SYMBOL(drm_vblank_init); /** * drm_dev_has_vblank - test if vblanking has been initialized for * a device * @dev: the device * * Drivers may call this function to test if vblank support is * initialized for a device. For most hardware this means that vblanking * can also be enabled. * * Atomic helpers use this function to initialize * &drm_crtc_state.no_vblank. See also drm_atomic_helper_check_modeset(). * * Returns: * True if vblanking has been initialized for the given device, false * otherwise. */ bool drm_dev_has_vblank(const struct drm_device *dev) { return dev->num_crtcs != 0; } EXPORT_SYMBOL(drm_dev_has_vblank); /** * drm_crtc_vblank_waitqueue - get vblank waitqueue for the CRTC * @crtc: which CRTC's vblank waitqueue to retrieve * * This function returns a pointer to the vblank waitqueue for the CRTC. * Drivers can use this to implement vblank waits using wait_event() and related * functions. */ wait_queue_head_t *drm_crtc_vblank_waitqueue(struct drm_crtc *crtc) { return &crtc->dev->vblank[drm_crtc_index(crtc)].queue; } EXPORT_SYMBOL(drm_crtc_vblank_waitqueue); /** * drm_calc_timestamping_constants - calculate vblank timestamp constants * @crtc: drm_crtc whose timestamp constants should be updated. * @mode: display mode containing the scanout timings * * Calculate and store various constants which are later needed by vblank and * swap-completion timestamping, e.g, by * drm_crtc_vblank_helper_get_vblank_timestamp(). They are derived from * CRTC's true scanout timing, so they take things like panel scaling or * other adjustments into account. */ void drm_calc_timestamping_constants(struct drm_crtc *crtc, const struct drm_display_mode *mode) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); int linedur_ns = 0, framedur_ns = 0; int dotclock = mode->crtc_clock; if (!drm_dev_has_vblank(dev)) return; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; /* Valid dotclock? */ if (dotclock > 0) { int frame_size = mode->crtc_htotal * mode->crtc_vtotal; /* * Convert scanline length in pixels and video * dot clock to line duration and frame duration * in nanoseconds: */ linedur_ns = div_u64((u64) mode->crtc_htotal * 1000000, dotclock); framedur_ns = div_u64((u64) frame_size * 1000000, dotclock); /* * Fields of interlaced scanout modes are only half a frame duration. */ if (mode->flags & DRM_MODE_FLAG_INTERLACE) framedur_ns /= 2; } else { drm_err(dev, "crtc %u: Can't calculate constants, dotclock = 0!\n", crtc->base.id); } vblank->linedur_ns = linedur_ns; vblank->framedur_ns = framedur_ns; drm_mode_copy(&vblank->hwmode, mode); drm_dbg_core(dev, "crtc %u: hwmode: htotal %d, vtotal %d, vdisplay %d\n", crtc->base.id, mode->crtc_htotal, mode->crtc_vtotal, mode->crtc_vdisplay); drm_dbg_core(dev, "crtc %u: clock %d kHz framedur %d linedur %d\n", crtc->base.id, dotclock, framedur_ns, linedur_ns); } EXPORT_SYMBOL(drm_calc_timestamping_constants); /** * drm_crtc_vblank_helper_get_vblank_timestamp_internal - precise vblank * timestamp helper * @crtc: CRTC whose vblank timestamp to retrieve * @max_error: Desired maximum allowable error in timestamps (nanosecs) * On return contains true maximum error of timestamp * @vblank_time: Pointer to time which should receive the timestamp * @in_vblank_irq: * True when called from drm_crtc_handle_vblank(). Some drivers * need to apply some workarounds for gpu-specific vblank irq quirks * if flag is set. * @get_scanout_position: * Callback function to retrieve the scanout position. See * @struct drm_crtc_helper_funcs.get_scanout_position. * * Implements calculation of exact vblank timestamps from given drm_display_mode * timings and current video scanout position of a CRTC. * * The current implementation only handles standard video modes. For double scan * and interlaced modes the driver is supposed to adjust the hardware mode * (taken from &drm_crtc_state.adjusted mode for atomic modeset drivers) to * match the scanout position reported. * * Note that atomic drivers must call drm_calc_timestamping_constants() before * enabling a CRTC. The atomic helpers already take care of that in * drm_atomic_helper_calc_timestamping_constants(). * * Returns: * Returns true on success, and false on failure, i.e. when no accurate * timestamp could be acquired. */ bool drm_crtc_vblank_helper_get_vblank_timestamp_internal( struct drm_crtc *crtc, int *max_error, ktime_t *vblank_time, bool in_vblank_irq, drm_vblank_get_scanout_position_func get_scanout_position) { struct drm_device *dev = crtc->dev; unsigned int pipe = crtc->index; struct drm_vblank_crtc *vblank = &dev->vblank[pipe]; struct timespec64 ts_etime, ts_vblank_time; ktime_t stime, etime; bool vbl_status; const struct drm_display_mode *mode; int vpos, hpos, i; int delta_ns, duration_ns; if (pipe >= dev->num_crtcs) { drm_err(dev, "Invalid crtc %u\n", pipe); return false; } /* Scanout position query not supported? Should not happen. */ if (!get_scanout_position) { drm_err(dev, "Called from CRTC w/o get_scanout_position()!?\n"); return false; } if (drm_drv_uses_atomic_modeset(dev)) mode = &vblank->hwmode; else mode = &crtc->hwmode; /* If mode timing undefined, just return as no-op: * Happens during initial modesetting of a crtc. */ if (mode->crtc_clock == 0) { drm_dbg_core(dev, "crtc %u: Noop due to uninitialized mode.\n", pipe); drm_WARN_ON_ONCE(dev, drm_drv_uses_atomic_modeset(dev)); return false; } /* Get current scanout position with system timestamp. * Repeat query up to DRM_TIMESTAMP_MAXRETRIES times * if single query takes longer than max_error nanoseconds. * * This guarantees a tight bound on maximum error if * code gets preempted or delayed for some reason. */ for (i = 0; i < DRM_TIMESTAMP_MAXRETRIES; i++) { /* * Get vertical and horizontal scanout position vpos, hpos, * and bounding timestamps stime, etime, pre/post query. */ vbl_status = get_scanout_position(crtc, in_vblank_irq, &vpos, &hpos, &stime, &etime, mode); /* Return as no-op if scanout query unsupported or failed. */ if (!vbl_status) { drm_dbg_core(dev, "crtc %u : scanoutpos query failed.\n", pipe); return false; } /* Compute uncertainty in timestamp of scanout position query. */ duration_ns = ktime_to_ns(etime) - ktime_to_ns(stime); /* Accept result with < max_error nsecs timing uncertainty. */ if (duration_ns <= *max_error) break; } /* Noisy system timing? */ if (i == DRM_TIMESTAMP_MAXRETRIES) { drm_dbg_core(dev, "crtc %u: Noisy timestamp %d us > %d us [%d reps].\n", pipe, duration_ns / 1000, *max_error / 1000, i); } /* Return upper bound of timestamp precision error. */ *max_error = duration_ns; /* Convert scanout position into elapsed time at raw_time query * since start of scanout at first display scanline. delta_ns * can be negative if start of scanout hasn't happened yet. */ delta_ns = div_s64(1000000LL * (vpos * mode->crtc_htotal + hpos), mode->crtc_clock); /* Subtract time delta from raw timestamp to get final * vblank_time timestamp for end of vblank. */ *vblank_time = ktime_sub_ns(etime, delta_ns); if (!drm_debug_enabled(DRM_UT_VBL)) return true; ts_etime = ktime_to_timespec64(etime); ts_vblank_time = ktime_to_timespec64(*vblank_time); drm_dbg_vbl(dev, "crtc %u : v p(%d,%d)@ %lld.%06ld -> %lld.%06ld [e %d us, %d rep]\n", pipe, hpos, vpos, (u64)ts_etime.tv_sec, ts_etime.tv_nsec / 1000, (u64)ts_vblank_time.tv_sec, ts_vblank_time.tv_nsec / 1000, duration_ns / 1000, i); return true; } EXPORT_SYMBOL(drm_crtc_vblank_helper_get_vblank_timestamp_internal); /** * drm_crtc_vblank_helper_get_vblank_timestamp - precise vblank timestamp * helper * @crtc: CRTC whose vblank timestamp to retrieve * @max_error: Desired maximum allowable error in timestamps (nanosecs) * On return contains true maximum error of timestamp * @vblank_time: Pointer to time which should receive the timestamp * @in_vblank_irq: * True when called from drm_crtc_handle_vblank(). Some drivers * need to apply some workarounds for gpu-specific vblank irq quirks * if flag is set. * * Implements calculation of exact vblank timestamps from given drm_display_mode * timings and current video scanout position of a CRTC. This can be directly * used as the &drm_crtc_funcs.get_vblank_timestamp implementation of a kms * driver if &drm_crtc_helper_funcs.get_scanout_position is implemented. * * The current implementation only handles standard video modes. For double scan * and interlaced modes the driver is supposed to adjust the hardware mode * (taken from &drm_crtc_state.adjusted mode for atomic modeset drivers) to * match the scanout position reported. * * Note that atomic drivers must call drm_calc_timestamping_constants() before * enabling a CRTC. The atomic helpers already take care of that in * drm_atomic_helper_calc_timestamping_constants(). * * Returns: * Returns true on success, and false on failure, i.e. when no accurate * timestamp could be acquired. */ bool drm_crtc_vblank_helper_get_vblank_timestamp(struct drm_crtc *crtc, int *max_error, ktime_t *vblank_time, bool in_vblank_irq) { return drm_crtc_vblank_helper_get_vblank_timestamp_internal( crtc, max_error, vblank_time, in_vblank_irq, crtc->helper_private->get_scanout_position); } EXPORT_SYMBOL(drm_crtc_vblank_helper_get_vblank_timestamp); /** * drm_crtc_get_last_vbltimestamp - retrieve raw timestamp for the most * recent vblank interval * @crtc: CRTC whose vblank timestamp to retrieve * @tvblank: Pointer to target time which should receive the timestamp * @in_vblank_irq: * True when called from drm_crtc_handle_vblank(). Some drivers * need to apply some workarounds for gpu-specific vblank irq quirks * if flag is set. * * Fetches the system timestamp corresponding to the time of the most recent * vblank interval on specified CRTC. May call into kms-driver to * compute the timestamp with a high-precision GPU specific method. * * Returns zero if timestamp originates from uncorrected do_gettimeofday() * call, i.e., it isn't very precisely locked to the true vblank. * * Returns: * True if timestamp is considered to be very precise, false otherwise. */ static bool drm_crtc_get_last_vbltimestamp(struct drm_crtc *crtc, ktime_t *tvblank, bool in_vblank_irq) { bool ret = false; /* Define requested maximum error on timestamps (nanoseconds). */ int max_error = (int) drm_timestamp_precision * 1000; /* Query driver if possible and precision timestamping enabled. */ if (crtc && crtc->funcs->get_vblank_timestamp && max_error > 0) { ret = crtc->funcs->get_vblank_timestamp(crtc, &max_error, tvblank, in_vblank_irq); } /* GPU high precision timestamp query unsupported or failed. * Return current monotonic/gettimeofday timestamp as best estimate. */ if (!ret) *tvblank = ktime_get(); return ret; } static bool drm_get_last_vbltimestamp(struct drm_device *dev, unsigned int pipe, ktime_t *tvblank, bool in_vblank_irq) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); return drm_crtc_get_last_vbltimestamp(crtc, tvblank, in_vblank_irq); } /** * drm_crtc_vblank_count - retrieve "cooked" vblank counter value * @crtc: which counter to retrieve * * Fetches the "cooked" vblank count value that represents the number of * vblank events since the system was booted, including lost events due to * modesetting activity. Note that this timer isn't correct against a racing * vblank interrupt (since it only reports the software vblank counter), see * drm_crtc_accurate_vblank_count() for such use-cases. * * Note that for a given vblank counter value drm_crtc_handle_vblank() * and drm_crtc_vblank_count() or drm_crtc_vblank_count_and_time() * provide a barrier: Any writes done before calling * drm_crtc_handle_vblank() will be visible to callers of the later * functions, if the vblank count is the same or a later one. * * See also &drm_vblank_crtc.count. * * Returns: * The software vblank counter. */ u64 drm_crtc_vblank_count(struct drm_crtc *crtc) { return drm_vblank_count(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_count); /** * drm_vblank_count_and_time - retrieve "cooked" vblank counter value and the * system timestamp corresponding to that vblank counter value. * @dev: DRM device * @pipe: index of CRTC whose counter to retrieve * @vblanktime: Pointer to ktime_t to receive the vblank timestamp. * * Fetches the "cooked" vblank count value that represents the number of * vblank events since the system was booted, including lost events due to * modesetting activity. Returns corresponding system timestamp of the time * of the vblank interval that corresponds to the current vblank counter value. * * This is the legacy version of drm_crtc_vblank_count_and_time(). */ static u64 drm_vblank_count_and_time(struct drm_device *dev, unsigned int pipe, ktime_t *vblanktime) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); u64 vblank_count; unsigned int seq; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) { *vblanktime = 0; return 0; } do { seq = read_seqbegin(&vblank->seqlock); vblank_count = atomic64_read(&vblank->count); *vblanktime = vblank->time; } while (read_seqretry(&vblank->seqlock, seq)); return vblank_count; } /** * drm_crtc_vblank_count_and_time - retrieve "cooked" vblank counter value * and the system timestamp corresponding to that vblank counter value * @crtc: which counter to retrieve * @vblanktime: Pointer to time to receive the vblank timestamp. * * Fetches the "cooked" vblank count value that represents the number of * vblank events since the system was booted, including lost events due to * modesetting activity. Returns corresponding system timestamp of the time * of the vblank interval that corresponds to the current vblank counter value. * * Note that for a given vblank counter value drm_crtc_handle_vblank() * and drm_crtc_vblank_count() or drm_crtc_vblank_count_and_time() * provide a barrier: Any writes done before calling * drm_crtc_handle_vblank() will be visible to callers of the later * functions, if the vblank count is the same or a later one. * * See also &drm_vblank_crtc.count. */ u64 drm_crtc_vblank_count_and_time(struct drm_crtc *crtc, ktime_t *vblanktime) { return drm_vblank_count_and_time(crtc->dev, drm_crtc_index(crtc), vblanktime); } EXPORT_SYMBOL(drm_crtc_vblank_count_and_time); /** * drm_crtc_next_vblank_start - calculate the time of the next vblank * @crtc: the crtc for which to calculate next vblank time * @vblanktime: pointer to time to receive the next vblank timestamp. * * Calculate the expected time of the start of the next vblank period, * based on time of previous vblank and frame duration */ int drm_crtc_next_vblank_start(struct drm_crtc *crtc, ktime_t *vblanktime) { struct drm_vblank_crtc *vblank; struct drm_display_mode *mode; u64 vblank_start; if (!drm_dev_has_vblank(crtc->dev)) return -EINVAL; vblank = drm_crtc_vblank_crtc(crtc); mode = &vblank->hwmode; if (!vblank->framedur_ns || !vblank->linedur_ns) return -EINVAL; if (!drm_crtc_get_last_vbltimestamp(crtc, vblanktime, false)) return -EINVAL; vblank_start = DIV_ROUND_DOWN_ULL( (u64)vblank->framedur_ns * mode->crtc_vblank_start, mode->crtc_vtotal); *vblanktime = ktime_add(*vblanktime, ns_to_ktime(vblank_start)); return 0; } EXPORT_SYMBOL(drm_crtc_next_vblank_start); static void send_vblank_event(struct drm_device *dev, struct drm_pending_vblank_event *e, u64 seq, ktime_t now) { struct timespec64 tv; switch (e->event.base.type) { case DRM_EVENT_VBLANK: case DRM_EVENT_FLIP_COMPLETE: tv = ktime_to_timespec64(now); e->event.vbl.sequence = seq; /* * e->event is a user space structure, with hardcoded unsigned * 32-bit seconds/microseconds. This is safe as we always use * monotonic timestamps since linux-4.15 */ e->event.vbl.tv_sec = tv.tv_sec; e->event.vbl.tv_usec = tv.tv_nsec / 1000; break; case DRM_EVENT_CRTC_SEQUENCE: if (seq) e->event.seq.sequence = seq; e->event.seq.time_ns = ktime_to_ns(now); break; } trace_drm_vblank_event_delivered(e->base.file_priv, e->pipe, seq); /* * Use the same timestamp for any associated fence signal to avoid * mismatch in timestamps for vsync & fence events triggered by the * same HW event. Frameworks like SurfaceFlinger in Android expects the * retire-fence timestamp to match exactly with HW vsync as it uses it * for its software vsync modeling. */ drm_send_event_timestamp_locked(dev, &e->base, now); } /** * drm_crtc_arm_vblank_event - arm vblank event after pageflip * @crtc: the source CRTC of the vblank event * @e: the event to send * * A lot of drivers need to generate vblank events for the very next vblank * interrupt. For example when the page flip interrupt happens when the page * flip gets armed, but not when it actually executes within the next vblank * period. This helper function implements exactly the required vblank arming * behaviour. * * NOTE: Drivers using this to send out the &drm_crtc_state.event as part of an * atomic commit must ensure that the next vblank happens at exactly the same * time as the atomic commit is committed to the hardware. This function itself * does **not** protect against the next vblank interrupt racing with either this * function call or the atomic commit operation. A possible sequence could be: * * 1. Driver commits new hardware state into vblank-synchronized registers. * 2. A vblank happens, committing the hardware state. Also the corresponding * vblank interrupt is fired off and fully processed by the interrupt * handler. * 3. The atomic commit operation proceeds to call drm_crtc_arm_vblank_event(). * 4. The event is only send out for the next vblank, which is wrong. * * An equivalent race can happen when the driver calls * drm_crtc_arm_vblank_event() before writing out the new hardware state. * * The only way to make this work safely is to prevent the vblank from firing * (and the hardware from committing anything else) until the entire atomic * commit sequence has run to completion. If the hardware does not have such a * feature (e.g. using a "go" bit), then it is unsafe to use this functions. * Instead drivers need to manually send out the event from their interrupt * handler by calling drm_crtc_send_vblank_event() and make sure that there's no * possible race with the hardware committing the atomic update. * * Caller must hold a vblank reference for the event @e acquired by a * drm_crtc_vblank_get(), which will be dropped when the next vblank arrives. */ void drm_crtc_arm_vblank_event(struct drm_crtc *crtc, struct drm_pending_vblank_event *e) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); assert_spin_locked(&dev->event_lock); e->pipe = pipe; e->sequence = drm_crtc_accurate_vblank_count(crtc) + 1; list_add_tail(&e->base.link, &dev->vblank_event_list); } EXPORT_SYMBOL(drm_crtc_arm_vblank_event); /** * drm_crtc_send_vblank_event - helper to send vblank event after pageflip * @crtc: the source CRTC of the vblank event * @e: the event to send * * Updates sequence # and timestamp on event for the most recently processed * vblank, and sends it to userspace. Caller must hold event lock. * * See drm_crtc_arm_vblank_event() for a helper which can be used in certain * situation, especially to send out events for atomic commit operations. */ void drm_crtc_send_vblank_event(struct drm_crtc *crtc, struct drm_pending_vblank_event *e) { struct drm_device *dev = crtc->dev; u64 seq; unsigned int pipe = drm_crtc_index(crtc); ktime_t now; if (drm_dev_has_vblank(dev)) { seq = drm_vblank_count_and_time(dev, pipe, &now); } else { seq = 0; now = ktime_get(); } e->pipe = pipe; send_vblank_event(dev, e, seq, now); } EXPORT_SYMBOL(drm_crtc_send_vblank_event); static int __enable_vblank(struct drm_device *dev, unsigned int pipe) { if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (drm_WARN_ON(dev, !crtc)) return 0; if (crtc->funcs->enable_vblank) return crtc->funcs->enable_vblank(crtc); } return -EINVAL; } static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); int ret = 0; assert_spin_locked(&dev->vbl_lock); spin_lock(&dev->vblank_time_lock); if (!vblank->enabled) { /* * Enable vblank irqs under vblank_time_lock protection. * All vblank count & timestamp updates are held off * until we are done reinitializing master counter and * timestamps. Filtercode in drm_handle_vblank() will * prevent double-accounting of same vblank interval. */ ret = __enable_vblank(dev, pipe); drm_dbg_core(dev, "enabling vblank on crtc %u, ret: %d\n", pipe, ret); if (ret) { atomic_dec(&vblank->refcount); } else { drm_update_vblank_count(dev, pipe, 0); /* drm_update_vblank_count() includes a wmb so we just * need to ensure that the compiler emits the write * to mark the vblank as enabled after the call * to drm_update_vblank_count(). */ WRITE_ONCE(vblank->enabled, true); } } spin_unlock(&dev->vblank_time_lock); return ret; } int drm_vblank_get(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); unsigned long irqflags; int ret = 0; if (!drm_dev_has_vblank(dev)) return -EINVAL; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return -EINVAL; spin_lock_irqsave(&dev->vbl_lock, irqflags); /* Going from 0->1 means we have to enable interrupts again */ if (atomic_add_return(1, &vblank->refcount) == 1) { ret = drm_vblank_enable(dev, pipe); } else { if (!vblank->enabled) { atomic_dec(&vblank->refcount); ret = -EINVAL; } } spin_unlock_irqrestore(&dev->vbl_lock, irqflags); return ret; } /** * drm_crtc_vblank_get - get a reference count on vblank events * @crtc: which CRTC to own * * Acquire a reference count on vblank events to avoid having them disabled * while in use. * * Returns: * Zero on success or a negative error code on failure. */ int drm_crtc_vblank_get(struct drm_crtc *crtc) { return drm_vblank_get(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_get); void drm_vblank_put(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); int vblank_offdelay = vblank->config.offdelay_ms; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; if (drm_WARN_ON(dev, atomic_read(&vblank->refcount) == 0)) return; /* Last user schedules interrupt disable */ if (atomic_dec_and_test(&vblank->refcount)) { if (!vblank_offdelay) return; else if (vblank_offdelay < 0) vblank_disable_fn(&vblank->disable_timer); else if (!vblank->config.disable_immediate) mod_timer(&vblank->disable_timer, jiffies + ((vblank_offdelay * HZ) / 1000)); } } /** * drm_crtc_vblank_put - give up ownership of vblank events * @crtc: which counter to give up * * Release ownership of a given vblank counter, turning off interrupts * if possible. Disable interrupts after &drm_vblank_crtc_config.offdelay_ms * milliseconds. */ void drm_crtc_vblank_put(struct drm_crtc *crtc) { drm_vblank_put(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_put); /** * drm_wait_one_vblank - wait for one vblank * @dev: DRM device * @pipe: CRTC index * * This waits for one vblank to pass on @pipe, using the irq driver interfaces. * It is a failure to call this when the vblank irq for @pipe is disabled, e.g. * due to lack of driver support or because the crtc is off. * * This is the legacy version of drm_crtc_wait_one_vblank(). */ void drm_wait_one_vblank(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); int ret; u64 last; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; ret = drm_vblank_get(dev, pipe); if (drm_WARN(dev, ret, "vblank not available on crtc %i, ret=%i\n", pipe, ret)) return; last = drm_vblank_count(dev, pipe); ret = wait_event_timeout(vblank->queue, last != drm_vblank_count(dev, pipe), msecs_to_jiffies(100)); drm_WARN(dev, ret == 0, "vblank wait timed out on crtc %i\n", pipe); drm_vblank_put(dev, pipe); } EXPORT_SYMBOL(drm_wait_one_vblank); /** * drm_crtc_wait_one_vblank - wait for one vblank * @crtc: DRM crtc * * This waits for one vblank to pass on @crtc, using the irq driver interfaces. * It is a failure to call this when the vblank irq for @crtc is disabled, e.g. * due to lack of driver support or because the crtc is off. */ void drm_crtc_wait_one_vblank(struct drm_crtc *crtc) { drm_wait_one_vblank(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_wait_one_vblank); /** * drm_crtc_vblank_off - disable vblank events on a CRTC * @crtc: CRTC in question * * Drivers can use this function to shut down the vblank interrupt handling when * disabling a crtc. This function ensures that the latest vblank frame count is * stored so that drm_vblank_on can restore it again. * * Drivers must use this function when the hardware vblank counter can get * reset, e.g. when suspending or disabling the @crtc in general. */ void drm_crtc_vblank_off(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); struct drm_pending_vblank_event *e, *t; ktime_t now; u64 seq; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; /* * Grab event_lock early to prevent vblank work from being scheduled * while we're in the middle of shutting down vblank interrupts */ spin_lock_irq(&dev->event_lock); spin_lock(&dev->vbl_lock); drm_dbg_vbl(dev, "crtc %d, vblank enabled %d, inmodeset %d\n", pipe, vblank->enabled, vblank->inmodeset); /* Avoid redundant vblank disables without previous * drm_crtc_vblank_on(). */ if (drm_core_check_feature(dev, DRIVER_ATOMIC) || !vblank->inmodeset) drm_vblank_disable_and_save(dev, pipe); wake_up(&vblank->queue); /* * Prevent subsequent drm_vblank_get() from re-enabling * the vblank interrupt by bumping the refcount. */ if (!vblank->inmodeset) { atomic_inc(&vblank->refcount); vblank->inmodeset = 1; } spin_unlock(&dev->vbl_lock); /* Send any queued vblank events, lest the natives grow disquiet */ seq = drm_vblank_count_and_time(dev, pipe, &now); list_for_each_entry_safe(e, t, &dev->vblank_event_list, base.link) { if (e->pipe != pipe) continue; drm_dbg_core(dev, "Sending premature vblank event on disable: " "wanted %llu, current %llu\n", e->sequence, seq); list_del(&e->base.link); drm_vblank_put(dev, pipe); send_vblank_event(dev, e, seq, now); } /* Cancel any leftover pending vblank work */ drm_vblank_cancel_pending_works(vblank); spin_unlock_irq(&dev->event_lock); /* Will be reset by the modeset helpers when re-enabling the crtc by * calling drm_calc_timestamping_constants(). */ vblank->hwmode.crtc_clock = 0; /* Wait for any vblank work that's still executing to finish */ drm_vblank_flush_worker(vblank); } EXPORT_SYMBOL(drm_crtc_vblank_off); /** * drm_crtc_vblank_reset - reset vblank state to off on a CRTC * @crtc: CRTC in question * * Drivers can use this function to reset the vblank state to off at load time. * Drivers should use this together with the drm_crtc_vblank_off() and * drm_crtc_vblank_on() functions. The difference compared to * drm_crtc_vblank_off() is that this function doesn't save the vblank counter * and hence doesn't need to call any driver hooks. * * This is useful for recovering driver state e.g. on driver load, or on resume. */ void drm_crtc_vblank_reset(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); spin_lock_irq(&dev->vbl_lock); /* * Prevent subsequent drm_vblank_get() from enabling the vblank * interrupt by bumping the refcount. */ if (!vblank->inmodeset) { atomic_inc(&vblank->refcount); vblank->inmodeset = 1; } spin_unlock_irq(&dev->vbl_lock); drm_WARN_ON(dev, !list_empty(&dev->vblank_event_list)); drm_WARN_ON(dev, !list_empty(&vblank->pending_work)); } EXPORT_SYMBOL(drm_crtc_vblank_reset); /** * drm_crtc_set_max_vblank_count - configure the hw max vblank counter value * @crtc: CRTC in question * @max_vblank_count: max hardware vblank counter value * * Update the maximum hardware vblank counter value for @crtc * at runtime. Useful for hardware where the operation of the * hardware vblank counter depends on the currently active * display configuration. * * For example, if the hardware vblank counter does not work * when a specific connector is active the maximum can be set * to zero. And when that specific connector isn't active the * maximum can again be set to the appropriate non-zero value. * * If used, must be called before drm_vblank_on(). */ void drm_crtc_set_max_vblank_count(struct drm_crtc *crtc, u32 max_vblank_count) { struct drm_device *dev = crtc->dev; struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); drm_WARN_ON(dev, dev->max_vblank_count); drm_WARN_ON(dev, !READ_ONCE(vblank->inmodeset)); vblank->max_vblank_count = max_vblank_count; } EXPORT_SYMBOL(drm_crtc_set_max_vblank_count); /** * drm_crtc_vblank_on_config - enable vblank events on a CRTC with custom * configuration options * @crtc: CRTC in question * @config: Vblank configuration value * * See drm_crtc_vblank_on(). In addition, this function allows you to provide a * custom vblank configuration for a given CRTC. * * Note that @config is copied, the pointer does not need to stay valid beyond * this function call. For details of the parameters see * struct drm_vblank_crtc_config. */ void drm_crtc_vblank_on_config(struct drm_crtc *crtc, const struct drm_vblank_crtc_config *config) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; spin_lock_irq(&dev->vbl_lock); drm_dbg_vbl(dev, "crtc %d, vblank enabled %d, inmodeset %d\n", pipe, vblank->enabled, vblank->inmodeset); vblank->config = *config; /* Drop our private "prevent drm_vblank_get" refcount */ if (vblank->inmodeset) { atomic_dec(&vblank->refcount); vblank->inmodeset = 0; } drm_reset_vblank_timestamp(dev, pipe); /* * re-enable interrupts if there are users left, or the * user wishes vblank interrupts to be enabled all the time. */ if (atomic_read(&vblank->refcount) != 0 || !vblank->config.offdelay_ms) drm_WARN_ON(dev, drm_vblank_enable(dev, pipe)); spin_unlock_irq(&dev->vbl_lock); } EXPORT_SYMBOL(drm_crtc_vblank_on_config); /** * drm_crtc_vblank_on - enable vblank events on a CRTC * @crtc: CRTC in question * * This functions restores the vblank interrupt state captured with * drm_crtc_vblank_off() again and is generally called when enabling @crtc. Note * that calls to drm_crtc_vblank_on() and drm_crtc_vblank_off() can be * unbalanced and so can also be unconditionally called in driver load code to * reflect the current hardware state of the crtc. * * Note that unlike in drm_crtc_vblank_on_config(), default values are used. */ void drm_crtc_vblank_on(struct drm_crtc *crtc) { const struct drm_vblank_crtc_config config = { .offdelay_ms = drm_vblank_offdelay, .disable_immediate = crtc->dev->vblank_disable_immediate }; drm_crtc_vblank_on_config(crtc, &config); } EXPORT_SYMBOL(drm_crtc_vblank_on); static void drm_vblank_restore(struct drm_device *dev, unsigned int pipe) { ktime_t t_vblank; struct drm_vblank_crtc *vblank; int framedur_ns; u64 diff_ns; u32 cur_vblank, diff = 1; int count = DRM_TIMESTAMP_MAXRETRIES; u32 max_vblank_count = drm_max_vblank_count(dev, pipe); if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; assert_spin_locked(&dev->vbl_lock); assert_spin_locked(&dev->vblank_time_lock); vblank = drm_vblank_crtc(dev, pipe); drm_WARN_ONCE(dev, drm_debug_enabled(DRM_UT_VBL) && !vblank->framedur_ns, "Cannot compute missed vblanks without frame duration\n"); framedur_ns = vblank->framedur_ns; do { cur_vblank = __get_vblank_counter(dev, pipe); drm_get_last_vbltimestamp(dev, pipe, &t_vblank, false); } while (cur_vblank != __get_vblank_counter(dev, pipe) && --count > 0); diff_ns = ktime_to_ns(ktime_sub(t_vblank, vblank->time)); if (framedur_ns) diff = DIV_ROUND_CLOSEST_ULL(diff_ns, framedur_ns); drm_dbg_vbl(dev, "missed %d vblanks in %lld ns, frame duration=%d ns, hw_diff=%d\n", diff, diff_ns, framedur_ns, cur_vblank - vblank->last); vblank->last = (cur_vblank - diff) & max_vblank_count; } /** * drm_crtc_vblank_restore - estimate missed vblanks and update vblank count. * @crtc: CRTC in question * * Power manamement features can cause frame counter resets between vblank * disable and enable. Drivers can use this function in their * &drm_crtc_funcs.enable_vblank implementation to estimate missed vblanks since * the last &drm_crtc_funcs.disable_vblank using timestamps and update the * vblank counter. * * Note that drivers must have race-free high-precision timestamping support, * i.e. &drm_crtc_funcs.get_vblank_timestamp must be hooked up and * &drm_vblank_crtc_config.disable_immediate must be set to indicate the * time-stamping functions are race-free against vblank hardware counter * increments. */ void drm_crtc_vblank_restore(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); drm_WARN_ON_ONCE(dev, !crtc->funcs->get_vblank_timestamp); drm_WARN_ON_ONCE(dev, vblank->inmodeset); drm_WARN_ON_ONCE(dev, !vblank->config.disable_immediate); drm_vblank_restore(dev, pipe); } EXPORT_SYMBOL(drm_crtc_vblank_restore); static int drm_queue_vblank_event(struct drm_device *dev, unsigned int pipe, u64 req_seq, union drm_wait_vblank *vblwait, struct drm_file *file_priv) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); struct drm_pending_vblank_event *e; ktime_t now; u64 seq; int ret; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) { ret = -ENOMEM; goto err_put; } e->pipe = pipe; e->event.base.type = DRM_EVENT_VBLANK; e->event.base.length = sizeof(e->event.vbl); e->event.vbl.user_data = vblwait->request.signal; e->event.vbl.crtc_id = 0; if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (crtc) e->event.vbl.crtc_id = crtc->base.id; } spin_lock_irq(&dev->event_lock); /* * drm_crtc_vblank_off() might have been called after we called * drm_vblank_get(). drm_crtc_vblank_off() holds event_lock around the * vblank disable, so no need for further locking. The reference from * drm_vblank_get() protects against vblank disable from another source. */ if (!READ_ONCE(vblank->enabled)) { ret = -EINVAL; goto err_unlock; } ret = drm_event_reserve_init_locked(dev, file_priv, &e->base, &e->event.base); if (ret) goto err_unlock; seq = drm_vblank_count_and_time(dev, pipe, &now); drm_dbg_core(dev, "event on vblank count %llu, current %llu, crtc %u\n", req_seq, seq, pipe); trace_drm_vblank_event_queued(file_priv, pipe, req_seq); e->sequence = req_seq; if (drm_vblank_passed(seq, req_seq)) { drm_vblank_put(dev, pipe); send_vblank_event(dev, e, seq, now); vblwait->reply.sequence = seq; } else { /* drm_handle_vblank_events will call drm_vblank_put */ list_add_tail(&e->base.link, &dev->vblank_event_list); vblwait->reply.sequence = req_seq; } spin_unlock_irq(&dev->event_lock); return 0; err_unlock: spin_unlock_irq(&dev->event_lock); kfree(e); err_put: drm_vblank_put(dev, pipe); return ret; } static bool drm_wait_vblank_is_query(union drm_wait_vblank *vblwait) { if (vblwait->request.sequence) return false; return _DRM_VBLANK_RELATIVE == (vblwait->request.type & (_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_EVENT | _DRM_VBLANK_NEXTONMISS)); } /* * Widen a 32-bit param to 64-bits. * * \param narrow 32-bit value (missing upper 32 bits) * \param near 64-bit value that should be 'close' to near * * This function returns a 64-bit value using the lower 32-bits from * 'narrow' and constructing the upper 32-bits so that the result is * as close as possible to 'near'. */ static u64 widen_32_to_64(u32 narrow, u64 near) { return near + (s32) (narrow - near); } static void drm_wait_vblank_reply(struct drm_device *dev, unsigned int pipe, struct drm_wait_vblank_reply *reply) { ktime_t now; struct timespec64 ts; /* * drm_wait_vblank_reply is a UAPI structure that uses 'long' * to store the seconds. This is safe as we always use monotonic * timestamps since linux-4.15. */ reply->sequence = drm_vblank_count_and_time(dev, pipe, &now); ts = ktime_to_timespec64(now); reply->tval_sec = (u32)ts.tv_sec; reply->tval_usec = ts.tv_nsec / 1000; } static bool drm_wait_vblank_supported(struct drm_device *dev) { return drm_dev_has_vblank(dev); } int drm_wait_vblank_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_crtc *crtc; struct drm_vblank_crtc *vblank; union drm_wait_vblank *vblwait = data; int ret; u64 req_seq, seq; unsigned int pipe_index; unsigned int flags, pipe, high_pipe; if (!drm_wait_vblank_supported(dev)) return -EOPNOTSUPP; if (vblwait->request.type & _DRM_VBLANK_SIGNAL) return -EINVAL; if (vblwait->request.type & ~(_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK | _DRM_VBLANK_HIGH_CRTC_MASK)) { drm_dbg_core(dev, "Unsupported type value 0x%x, supported mask 0x%x\n", vblwait->request.type, (_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK | _DRM_VBLANK_HIGH_CRTC_MASK)); return -EINVAL; } flags = vblwait->request.type & _DRM_VBLANK_FLAGS_MASK; high_pipe = (vblwait->request.type & _DRM_VBLANK_HIGH_CRTC_MASK); if (high_pipe) pipe_index = high_pipe >> _DRM_VBLANK_HIGH_CRTC_SHIFT; else pipe_index = flags & _DRM_VBLANK_SECONDARY ? 1 : 0; /* Convert lease-relative crtc index into global crtc index */ if (drm_core_check_feature(dev, DRIVER_MODESET)) { pipe = 0; drm_for_each_crtc(crtc, dev) { if (drm_lease_held(file_priv, crtc->base.id)) { if (pipe_index == 0) break; pipe_index--; } pipe++; } } else { pipe = pipe_index; } if (pipe >= dev->num_crtcs) return -EINVAL; vblank = &dev->vblank[pipe]; /* If the counter is currently enabled and accurate, short-circuit * queries to return the cached timestamp of the last vblank. */ if (vblank->config.disable_immediate && drm_wait_vblank_is_query(vblwait) && READ_ONCE(vblank->enabled)) { drm_wait_vblank_reply(dev, pipe, &vblwait->reply); return 0; } ret = drm_vblank_get(dev, pipe); if (ret) { drm_dbg_core(dev, "crtc %d failed to acquire vblank counter, %d\n", pipe, ret); return ret; } seq = drm_vblank_count(dev, pipe); switch (vblwait->request.type & _DRM_VBLANK_TYPES_MASK) { case _DRM_VBLANK_RELATIVE: req_seq = seq + vblwait->request.sequence; vblwait->request.sequence = req_seq; vblwait->request.type &= ~_DRM_VBLANK_RELATIVE; break; case _DRM_VBLANK_ABSOLUTE: req_seq = widen_32_to_64(vblwait->request.sequence, seq); break; default: ret = -EINVAL; goto done; } if ((flags & _DRM_VBLANK_NEXTONMISS) && drm_vblank_passed(seq, req_seq)) { req_seq = seq + 1; vblwait->request.type &= ~_DRM_VBLANK_NEXTONMISS; vblwait->request.sequence = req_seq; } if (flags & _DRM_VBLANK_EVENT) { /* must hold on to the vblank ref until the event fires * drm_vblank_put will be called asynchronously */ return drm_queue_vblank_event(dev, pipe, req_seq, vblwait, file_priv); } if (req_seq != seq) { int wait; drm_dbg_core(dev, "waiting on vblank count %llu, crtc %u\n", req_seq, pipe); wait = wait_event_interruptible_timeout(vblank->queue, drm_vblank_passed(drm_vblank_count(dev, pipe), req_seq) || !READ_ONCE(vblank->enabled), msecs_to_jiffies(3000)); switch (wait) { case 0: /* timeout */ ret = -EBUSY; break; case -ERESTARTSYS: /* interrupted by signal */ ret = -EINTR; break; default: ret = 0; break; } } if (ret != -EINTR) { drm_wait_vblank_reply(dev, pipe, &vblwait->reply); drm_dbg_core(dev, "crtc %d returning %u to client\n", pipe, vblwait->reply.sequence); } else { drm_dbg_core(dev, "crtc %d vblank wait interrupted by signal\n", pipe); } done: drm_vblank_put(dev, pipe); return ret; } static void drm_handle_vblank_events(struct drm_device *dev, unsigned int pipe) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); bool high_prec = false; struct drm_pending_vblank_event *e, *t; ktime_t now; u64 seq; assert_spin_locked(&dev->event_lock); seq = drm_vblank_count_and_time(dev, pipe, &now); list_for_each_entry_safe(e, t, &dev->vblank_event_list, base.link) { if (e->pipe != pipe) continue; if (!drm_vblank_passed(seq, e->sequence)) continue; drm_dbg_core(dev, "vblank event on %llu, current %llu\n", e->sequence, seq); list_del(&e->base.link); drm_vblank_put(dev, pipe); send_vblank_event(dev, e, seq, now); } if (crtc && crtc->funcs->get_vblank_timestamp) high_prec = true; trace_drm_vblank_event(pipe, seq, now, high_prec); } /** * drm_handle_vblank - handle a vblank event * @dev: DRM device * @pipe: index of CRTC where this event occurred * * Drivers should call this routine in their vblank interrupt handlers to * update the vblank counter and send any signals that may be pending. * * This is the legacy version of drm_crtc_handle_vblank(). */ bool drm_handle_vblank(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); unsigned long irqflags; bool disable_irq; if (drm_WARN_ON_ONCE(dev, !drm_dev_has_vblank(dev))) return false; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return false; spin_lock_irqsave(&dev->event_lock, irqflags); /* Need timestamp lock to prevent concurrent execution with * vblank enable/disable, as this would cause inconsistent * or corrupted timestamps and vblank counts. */ spin_lock(&dev->vblank_time_lock); /* Vblank irq handling disabled. Nothing to do. */ if (!vblank->enabled) { spin_unlock(&dev->vblank_time_lock); spin_unlock_irqrestore(&dev->event_lock, irqflags); return false; } drm_update_vblank_count(dev, pipe, true); spin_unlock(&dev->vblank_time_lock); wake_up(&vblank->queue); /* With instant-off, we defer disabling the interrupt until after * we finish processing the following vblank after all events have * been signaled. The disable has to be last (after * drm_handle_vblank_events) so that the timestamp is always accurate. */ disable_irq = (vblank->config.disable_immediate && vblank->config.offdelay_ms > 0 && !atomic_read(&vblank->refcount)); drm_handle_vblank_events(dev, pipe); drm_handle_vblank_works(vblank); spin_unlock_irqrestore(&dev->event_lock, irqflags); if (disable_irq) vblank_disable_fn(&vblank->disable_timer); return true; } EXPORT_SYMBOL(drm_handle_vblank); /** * drm_crtc_handle_vblank - handle a vblank event * @crtc: where this event occurred * * Drivers should call this routine in their vblank interrupt handlers to * update the vblank counter and send any signals that may be pending. * * This is the native KMS version of drm_handle_vblank(). * * Note that for a given vblank counter value drm_crtc_handle_vblank() * and drm_crtc_vblank_count() or drm_crtc_vblank_count_and_time() * provide a barrier: Any writes done before calling * drm_crtc_handle_vblank() will be visible to callers of the later * functions, if the vblank count is the same or a later one. * * See also &drm_vblank_crtc.count. * * Returns: * True if the event was successfully handled, false on failure. */ bool drm_crtc_handle_vblank(struct drm_crtc *crtc) { return drm_handle_vblank(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_handle_vblank); /* * Get crtc VBLANK count. * * \param dev DRM device * \param data user argument, pointing to a drm_crtc_get_sequence structure. * \param file_priv drm file private for the user's open file descriptor */ int drm_crtc_get_sequence_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_crtc *crtc; struct drm_vblank_crtc *vblank; int pipe; struct drm_crtc_get_sequence *get_seq = data; ktime_t now; bool vblank_enabled; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; if (!drm_dev_has_vblank(dev)) return -EOPNOTSUPP; crtc = drm_crtc_find(dev, file_priv, get_seq->crtc_id); if (!crtc) return -ENOENT; pipe = drm_crtc_index(crtc); vblank = drm_crtc_vblank_crtc(crtc); vblank_enabled = READ_ONCE(vblank->config.disable_immediate) && READ_ONCE(vblank->enabled); if (!vblank_enabled) { ret = drm_crtc_vblank_get(crtc); if (ret) { drm_dbg_core(dev, "crtc %d failed to acquire vblank counter, %d\n", pipe, ret); return ret; } } drm_modeset_lock(&crtc->mutex, NULL); if (crtc->state) get_seq->active = crtc->state->enable; else get_seq->active = crtc->enabled; drm_modeset_unlock(&crtc->mutex); get_seq->sequence = drm_vblank_count_and_time(dev, pipe, &now); get_seq->sequence_ns = ktime_to_ns(now); if (!vblank_enabled) drm_crtc_vblank_put(crtc); return 0; } /* * Queue a event for VBLANK sequence * * \param dev DRM device * \param data user argument, pointing to a drm_crtc_queue_sequence structure. * \param file_priv drm file private for the user's open file descriptor */ int drm_crtc_queue_sequence_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_crtc *crtc; struct drm_vblank_crtc *vblank; int pipe; struct drm_crtc_queue_sequence *queue_seq = data; ktime_t now; struct drm_pending_vblank_event *e; u32 flags; u64 seq; u64 req_seq; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; if (!drm_dev_has_vblank(dev)) return -EOPNOTSUPP; crtc = drm_crtc_find(dev, file_priv, queue_seq->crtc_id); if (!crtc) return -ENOENT; flags = queue_seq->flags; /* Check valid flag bits */ if (flags & ~(DRM_CRTC_SEQUENCE_RELATIVE| DRM_CRTC_SEQUENCE_NEXT_ON_MISS)) return -EINVAL; pipe = drm_crtc_index(crtc); vblank = drm_crtc_vblank_crtc(crtc); e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) return -ENOMEM; ret = drm_crtc_vblank_get(crtc); if (ret) { drm_dbg_core(dev, "crtc %d failed to acquire vblank counter, %d\n", pipe, ret); goto err_free; } seq = drm_vblank_count_and_time(dev, pipe, &now); req_seq = queue_seq->sequence; if (flags & DRM_CRTC_SEQUENCE_RELATIVE) req_seq += seq; if ((flags & DRM_CRTC_SEQUENCE_NEXT_ON_MISS) && drm_vblank_passed(seq, req_seq)) req_seq = seq + 1; e->pipe = pipe; e->event.base.type = DRM_EVENT_CRTC_SEQUENCE; e->event.base.length = sizeof(e->event.seq); e->event.seq.user_data = queue_seq->user_data; spin_lock_irq(&dev->event_lock); /* * drm_crtc_vblank_off() might have been called after we called * drm_crtc_vblank_get(). drm_crtc_vblank_off() holds event_lock around the * vblank disable, so no need for further locking. The reference from * drm_crtc_vblank_get() protects against vblank disable from another source. */ if (!READ_ONCE(vblank->enabled)) { ret = -EINVAL; goto err_unlock; } ret = drm_event_reserve_init_locked(dev, file_priv, &e->base, &e->event.base); if (ret) goto err_unlock; e->sequence = req_seq; if (drm_vblank_passed(seq, req_seq)) { drm_crtc_vblank_put(crtc); send_vblank_event(dev, e, seq, now); queue_seq->sequence = seq; } else { /* drm_handle_vblank_events will call drm_vblank_put */ list_add_tail(&e->base.link, &dev->vblank_event_list); queue_seq->sequence = req_seq; } spin_unlock_irq(&dev->event_lock); return 0; err_unlock: spin_unlock_irq(&dev->event_lock); drm_crtc_vblank_put(crtc); err_free: kfree(e); return ret; }
1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Samsung Electronics * Krzysztof Opasiak <k.opasiak@samsung.com> */ #include <asm/byteorder.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <net/sock.h> #include "usbip_common.h" #define DRIVER_AUTHOR "Takahiro Hirofuchi <hirofuchi@users.sourceforge.net>" #define DRIVER_DESC "USB/IP Core" #ifdef CONFIG_USBIP_DEBUG unsigned long usbip_debug_flag = 0xffffffff; #else unsigned long usbip_debug_flag; #endif EXPORT_SYMBOL_GPL(usbip_debug_flag); module_param(usbip_debug_flag, ulong, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(usbip_debug_flag, "debug flags (defined in usbip_common.h)"); /* FIXME */ struct device_attribute dev_attr_usbip_debug; EXPORT_SYMBOL_GPL(dev_attr_usbip_debug); static ssize_t usbip_debug_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lx\n", usbip_debug_flag); } static ssize_t usbip_debug_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { if (sscanf(buf, "%lx", &usbip_debug_flag) != 1) return -EINVAL; return count; } DEVICE_ATTR_RW(usbip_debug); static void usbip_dump_buffer(char *buff, int bufflen) { print_hex_dump(KERN_DEBUG, "usbip-core", DUMP_PREFIX_OFFSET, 16, 4, buff, bufflen, false); } static void usbip_dump_pipe(unsigned int p) { unsigned char type = usb_pipetype(p); unsigned char ep = usb_pipeendpoint(p); unsigned char dev = usb_pipedevice(p); unsigned char dir = usb_pipein(p); pr_debug("dev(%d) ep(%d) [%s] ", dev, ep, dir ? "IN" : "OUT"); switch (type) { case PIPE_ISOCHRONOUS: pr_debug("ISO\n"); break; case PIPE_INTERRUPT: pr_debug("INT\n"); break; case PIPE_CONTROL: pr_debug("CTRL\n"); break; case PIPE_BULK: pr_debug("BULK\n"); break; default: pr_debug("ERR\n"); break; } } static void usbip_dump_usb_device(struct usb_device *udev) { struct device *dev = &udev->dev; int i; dev_dbg(dev, " devnum(%d) devpath(%s) usb speed(%s)", udev->devnum, udev->devpath, usb_speed_string(udev->speed)); pr_debug("tt hub ttport %d\n", udev->ttport); dev_dbg(dev, " "); for (i = 0; i < 16; i++) pr_debug(" %2u", i); pr_debug("\n"); dev_dbg(dev, " toggle0(IN) :"); for (i = 0; i < 16; i++) pr_debug(" %2u", (udev->toggle[0] & (1 << i)) ? 1 : 0); pr_debug("\n"); dev_dbg(dev, " toggle1(OUT):"); for (i = 0; i < 16; i++) pr_debug(" %2u", (udev->toggle[1] & (1 << i)) ? 1 : 0); pr_debug("\n"); dev_dbg(dev, " epmaxp_in :"); for (i = 0; i < 16; i++) { if (udev->ep_in[i]) pr_debug(" %2u", le16_to_cpu(udev->ep_in[i]->desc.wMaxPacketSize)); } pr_debug("\n"); dev_dbg(dev, " epmaxp_out :"); for (i = 0; i < 16; i++) { if (udev->ep_out[i]) pr_debug(" %2u", le16_to_cpu(udev->ep_out[i]->desc.wMaxPacketSize)); } pr_debug("\n"); dev_dbg(dev, "parent %s, bus %s\n", dev_name(&udev->parent->dev), udev->bus->bus_name); dev_dbg(dev, "have_langid %d, string_langid %d\n", udev->have_langid, udev->string_langid); dev_dbg(dev, "maxchild %d\n", udev->maxchild); } static void usbip_dump_request_type(__u8 rt) { switch (rt & USB_RECIP_MASK) { case USB_RECIP_DEVICE: pr_debug("DEVICE"); break; case USB_RECIP_INTERFACE: pr_debug("INTERF"); break; case USB_RECIP_ENDPOINT: pr_debug("ENDPOI"); break; case USB_RECIP_OTHER: pr_debug("OTHER "); break; default: pr_debug("------"); break; } } static void usbip_dump_usb_ctrlrequest(struct usb_ctrlrequest *cmd) { if (!cmd) { pr_debug(" : null pointer\n"); return; } pr_debug(" "); pr_debug("bRequestType(%02X) bRequest(%02X) wValue(%04X) wIndex(%04X) wLength(%04X) ", cmd->bRequestType, cmd->bRequest, cmd->wValue, cmd->wIndex, cmd->wLength); pr_debug("\n "); if ((cmd->bRequestType & USB_TYPE_MASK) == USB_TYPE_STANDARD) { pr_debug("STANDARD "); switch (cmd->bRequest) { case USB_REQ_GET_STATUS: pr_debug("GET_STATUS\n"); break; case USB_REQ_CLEAR_FEATURE: pr_debug("CLEAR_FEAT\n"); break; case USB_REQ_SET_FEATURE: pr_debug("SET_FEAT\n"); break; case USB_REQ_SET_ADDRESS: pr_debug("SET_ADDRRS\n"); break; case USB_REQ_GET_DESCRIPTOR: pr_debug("GET_DESCRI\n"); break; case USB_REQ_SET_DESCRIPTOR: pr_debug("SET_DESCRI\n"); break; case USB_REQ_GET_CONFIGURATION: pr_debug("GET_CONFIG\n"); break; case USB_REQ_SET_CONFIGURATION: pr_debug("SET_CONFIG\n"); break; case USB_REQ_GET_INTERFACE: pr_debug("GET_INTERF\n"); break; case USB_REQ_SET_INTERFACE: pr_debug("SET_INTERF\n"); break; case USB_REQ_SYNCH_FRAME: pr_debug("SYNC_FRAME\n"); break; default: pr_debug("REQ(%02X)\n", cmd->bRequest); break; } usbip_dump_request_type(cmd->bRequestType); } else if ((cmd->bRequestType & USB_TYPE_MASK) == USB_TYPE_CLASS) { pr_debug("CLASS\n"); } else if ((cmd->bRequestType & USB_TYPE_MASK) == USB_TYPE_VENDOR) { pr_debug("VENDOR\n"); } else if ((cmd->bRequestType & USB_TYPE_MASK) == USB_TYPE_RESERVED) { pr_debug("RESERVED\n"); } } void usbip_dump_urb(struct urb *urb) { struct device *dev; if (!urb) { pr_debug("urb: null pointer!!\n"); return; } if (!urb->dev) { pr_debug("urb->dev: null pointer!!\n"); return; } dev = &urb->dev->dev; usbip_dump_usb_device(urb->dev); dev_dbg(dev, " pipe :%08x ", urb->pipe); usbip_dump_pipe(urb->pipe); dev_dbg(dev, " status :%d\n", urb->status); dev_dbg(dev, " transfer_flags :%08X\n", urb->transfer_flags); dev_dbg(dev, " transfer_buffer_length:%d\n", urb->transfer_buffer_length); dev_dbg(dev, " actual_length :%d\n", urb->actual_length); if (urb->setup_packet && usb_pipetype(urb->pipe) == PIPE_CONTROL) usbip_dump_usb_ctrlrequest( (struct usb_ctrlrequest *)urb->setup_packet); dev_dbg(dev, " start_frame :%d\n", urb->start_frame); dev_dbg(dev, " number_of_packets :%d\n", urb->number_of_packets); dev_dbg(dev, " interval :%d\n", urb->interval); dev_dbg(dev, " error_count :%d\n", urb->error_count); } EXPORT_SYMBOL_GPL(usbip_dump_urb); void usbip_dump_header(struct usbip_header *pdu) { pr_debug("BASE: cmd %u seq %u devid %u dir %u ep %u\n", pdu->base.command, pdu->base.seqnum, pdu->base.devid, pdu->base.direction, pdu->base.ep); switch (pdu->base.command) { case USBIP_CMD_SUBMIT: pr_debug("USBIP_CMD_SUBMIT: x_flags %u x_len %u sf %u #p %d iv %d\n", pdu->u.cmd_submit.transfer_flags, pdu->u.cmd_submit.transfer_buffer_length, pdu->u.cmd_submit.start_frame, pdu->u.cmd_submit.number_of_packets, pdu->u.cmd_submit.interval); break; case USBIP_CMD_UNLINK: pr_debug("USBIP_CMD_UNLINK: seq %u\n", pdu->u.cmd_unlink.seqnum); break; case USBIP_RET_SUBMIT: pr_debug("USBIP_RET_SUBMIT: st %d al %u sf %d #p %d ec %d\n", pdu->u.ret_submit.status, pdu->u.ret_submit.actual_length, pdu->u.ret_submit.start_frame, pdu->u.ret_submit.number_of_packets, pdu->u.ret_submit.error_count); break; case USBIP_RET_UNLINK: pr_debug("USBIP_RET_UNLINK: status %d\n", pdu->u.ret_unlink.status); break; default: /* NOT REACHED */ pr_err("unknown command\n"); break; } } EXPORT_SYMBOL_GPL(usbip_dump_header); /* Receive data over TCP/IP. */ int usbip_recv(struct socket *sock, void *buf, int size) { int result; struct kvec iov = {.iov_base = buf, .iov_len = size}; struct msghdr msg = {.msg_flags = MSG_NOSIGNAL}; int total = 0; if (!sock || !buf || !size) return -EINVAL; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size); usbip_dbg_xmit("enter\n"); do { sock->sk->sk_allocation = GFP_NOIO; sock->sk->sk_use_task_frag = false; result = sock_recvmsg(sock, &msg, MSG_WAITALL); if (result <= 0) goto err; total += result; } while (msg_data_left(&msg)); if (usbip_dbg_flag_xmit) { pr_debug("receiving....\n"); usbip_dump_buffer(buf, size); pr_debug("received, osize %d ret %d size %zd total %d\n", size, result, msg_data_left(&msg), total); } return total; err: return result; } EXPORT_SYMBOL_GPL(usbip_recv); /* there may be more cases to tweak the flags. */ static unsigned int tweak_transfer_flags(unsigned int flags) { flags &= ~URB_NO_TRANSFER_DMA_MAP; return flags; } /* * USBIP driver packs URB transfer flags in PDUs that are exchanged * between Server (usbip_host) and Client (vhci_hcd). URB_* flags * are internal to kernel and could change. Where as USBIP URB flags * exchanged in PDUs are USBIP user API must not change. * * USBIP_URB* flags are exported as explicit API and client and server * do mapping from kernel flags to USBIP_URB*. Details as follows: * * Client tx path (USBIP_CMD_SUBMIT): * - Maps URB_* to USBIP_URB_* when it sends USBIP_CMD_SUBMIT packet. * * Server rx path (USBIP_CMD_SUBMIT): * - Maps USBIP_URB_* to URB_* when it receives USBIP_CMD_SUBMIT packet. * * Flags aren't included in USBIP_CMD_UNLINK and USBIP_RET_SUBMIT packets * and no special handling is needed for them in the following cases: * - Server rx path (USBIP_CMD_UNLINK) * - Client rx path & Server tx path (USBIP_RET_SUBMIT) * * Code paths: * usbip_pack_pdu() is the common routine that handles packing pdu from * urb and unpack pdu to an urb. * * usbip_pack_cmd_submit() and usbip_pack_ret_submit() handle * USBIP_CMD_SUBMIT and USBIP_RET_SUBMIT respectively. * * usbip_map_urb_to_usbip() and usbip_map_usbip_to_urb() are used * by usbip_pack_cmd_submit() and usbip_pack_ret_submit() to map * flags. */ struct urb_to_usbip_flags { u32 urb_flag; u32 usbip_flag; }; #define NUM_USBIP_FLAGS 17 static const struct urb_to_usbip_flags flag_map[NUM_USBIP_FLAGS] = { {URB_SHORT_NOT_OK, USBIP_URB_SHORT_NOT_OK}, {URB_ISO_ASAP, USBIP_URB_ISO_ASAP}, {URB_NO_TRANSFER_DMA_MAP, USBIP_URB_NO_TRANSFER_DMA_MAP}, {URB_ZERO_PACKET, USBIP_URB_ZERO_PACKET}, {URB_NO_INTERRUPT, USBIP_URB_NO_INTERRUPT}, {URB_FREE_BUFFER, USBIP_URB_FREE_BUFFER}, {URB_DIR_IN, USBIP_URB_DIR_IN}, {URB_DIR_OUT, USBIP_URB_DIR_OUT}, {URB_DIR_MASK, USBIP_URB_DIR_MASK}, {URB_DMA_MAP_SINGLE, USBIP_URB_DMA_MAP_SINGLE}, {URB_DMA_MAP_PAGE, USBIP_URB_DMA_MAP_PAGE}, {URB_DMA_MAP_SG, USBIP_URB_DMA_MAP_SG}, {URB_MAP_LOCAL, USBIP_URB_MAP_LOCAL}, {URB_SETUP_MAP_SINGLE, USBIP_URB_SETUP_MAP_SINGLE}, {URB_SETUP_MAP_LOCAL, USBIP_URB_SETUP_MAP_LOCAL}, {URB_DMA_SG_COMBINED, USBIP_URB_DMA_SG_COMBINED}, {URB_ALIGNED_TEMP_BUFFER, USBIP_URB_ALIGNED_TEMP_BUFFER}, }; static unsigned int urb_to_usbip(unsigned int flags) { unsigned int map_flags = 0; int loop; for (loop = 0; loop < NUM_USBIP_FLAGS; loop++) { if (flags & flag_map[loop].urb_flag) map_flags |= flag_map[loop].usbip_flag; } return map_flags; } static unsigned int usbip_to_urb(unsigned int flags) { unsigned int map_flags = 0; int loop; for (loop = 0; loop < NUM_USBIP_FLAGS; loop++) { if (flags & flag_map[loop].usbip_flag) map_flags |= flag_map[loop].urb_flag; } return map_flags; } static void usbip_pack_cmd_submit(struct usbip_header *pdu, struct urb *urb, int pack) { struct usbip_header_cmd_submit *spdu = &pdu->u.cmd_submit; /* * Some members are not still implemented in usbip. I hope this issue * will be discussed when usbip is ported to other operating systems. */ if (pack) { /* map after tweaking the urb flags */ spdu->transfer_flags = urb_to_usbip(tweak_transfer_flags(urb->transfer_flags)); spdu->transfer_buffer_length = urb->transfer_buffer_length; spdu->start_frame = urb->start_frame; spdu->number_of_packets = urb->number_of_packets; spdu->interval = urb->interval; } else { urb->transfer_flags = usbip_to_urb(spdu->transfer_flags); urb->transfer_buffer_length = spdu->transfer_buffer_length; urb->start_frame = spdu->start_frame; urb->number_of_packets = spdu->number_of_packets; urb->interval = spdu->interval; } } static void usbip_pack_ret_submit(struct usbip_header *pdu, struct urb *urb, int pack) { struct usbip_header_ret_submit *rpdu = &pdu->u.ret_submit; if (pack) { rpdu->status = urb->status; rpdu->actual_length = urb->actual_length; rpdu->start_frame = urb->start_frame; rpdu->number_of_packets = urb->number_of_packets; rpdu->error_count = urb->error_count; } else { urb->status = rpdu->status; urb->actual_length = rpdu->actual_length; urb->start_frame = rpdu->start_frame; urb->number_of_packets = rpdu->number_of_packets; urb->error_count = rpdu->error_count; } } void usbip_pack_pdu(struct usbip_header *pdu, struct urb *urb, int cmd, int pack) { switch (cmd) { case USBIP_CMD_SUBMIT: usbip_pack_cmd_submit(pdu, urb, pack); break; case USBIP_RET_SUBMIT: usbip_pack_ret_submit(pdu, urb, pack); break; default: /* NOT REACHED */ pr_err("unknown command\n"); break; } } EXPORT_SYMBOL_GPL(usbip_pack_pdu); static void correct_endian_basic(struct usbip_header_basic *base, int send) { if (send) { base->command = cpu_to_be32(base->command); base->seqnum = cpu_to_be32(base->seqnum); base->devid = cpu_to_be32(base->devid); base->direction = cpu_to_be32(base->direction); base->ep = cpu_to_be32(base->ep); } else { base->command = be32_to_cpu(base->command); base->seqnum = be32_to_cpu(base->seqnum); base->devid = be32_to_cpu(base->devid); base->direction = be32_to_cpu(base->direction); base->ep = be32_to_cpu(base->ep); } } static void correct_endian_cmd_submit(struct usbip_header_cmd_submit *pdu, int send) { if (send) { pdu->transfer_flags = cpu_to_be32(pdu->transfer_flags); cpu_to_be32s(&pdu->transfer_buffer_length); cpu_to_be32s(&pdu->start_frame); cpu_to_be32s(&pdu->number_of_packets); cpu_to_be32s(&pdu->interval); } else { pdu->transfer_flags = be32_to_cpu(pdu->transfer_flags); be32_to_cpus(&pdu->transfer_buffer_length); be32_to_cpus(&pdu->start_frame); be32_to_cpus(&pdu->number_of_packets); be32_to_cpus(&pdu->interval); } } static void correct_endian_ret_submit(struct usbip_header_ret_submit *pdu, int send) { if (send) { cpu_to_be32s(&pdu->status); cpu_to_be32s(&pdu->actual_length); cpu_to_be32s(&pdu->start_frame); cpu_to_be32s(&pdu->number_of_packets); cpu_to_be32s(&pdu->error_count); } else { be32_to_cpus(&pdu->status); be32_to_cpus(&pdu->actual_length); be32_to_cpus(&pdu->start_frame); be32_to_cpus(&pdu->number_of_packets); be32_to_cpus(&pdu->error_count); } } static void correct_endian_cmd_unlink(struct usbip_header_cmd_unlink *pdu, int send) { if (send) pdu->seqnum = cpu_to_be32(pdu->seqnum); else pdu->seqnum = be32_to_cpu(pdu->seqnum); } static void correct_endian_ret_unlink(struct usbip_header_ret_unlink *pdu, int send) { if (send) cpu_to_be32s(&pdu->status); else be32_to_cpus(&pdu->status); } void usbip_header_correct_endian(struct usbip_header *pdu, int send) { __u32 cmd = 0; if (send) cmd = pdu->base.command; correct_endian_basic(&pdu->base, send); if (!send) cmd = pdu->base.command; switch (cmd) { case USBIP_CMD_SUBMIT: correct_endian_cmd_submit(&pdu->u.cmd_submit, send); break; case USBIP_RET_SUBMIT: correct_endian_ret_submit(&pdu->u.ret_submit, send); break; case USBIP_CMD_UNLINK: correct_endian_cmd_unlink(&pdu->u.cmd_unlink, send); break; case USBIP_RET_UNLINK: correct_endian_ret_unlink(&pdu->u.ret_unlink, send); break; default: /* NOT REACHED */ pr_err("unknown command\n"); break; } } EXPORT_SYMBOL_GPL(usbip_header_correct_endian); static void usbip_iso_packet_correct_endian( struct usbip_iso_packet_descriptor *iso, int send) { /* does not need all members. but copy all simply. */ if (send) { iso->offset = cpu_to_be32(iso->offset); iso->length = cpu_to_be32(iso->length); iso->status = cpu_to_be32(iso->status); iso->actual_length = cpu_to_be32(iso->actual_length); } else { iso->offset = be32_to_cpu(iso->offset); iso->length = be32_to_cpu(iso->length); iso->status = be32_to_cpu(iso->status); iso->actual_length = be32_to_cpu(iso->actual_length); } } static void usbip_pack_iso(struct usbip_iso_packet_descriptor *iso, struct usb_iso_packet_descriptor *uiso, int pack) { if (pack) { iso->offset = uiso->offset; iso->length = uiso->length; iso->status = uiso->status; iso->actual_length = uiso->actual_length; } else { uiso->offset = iso->offset; uiso->length = iso->length; uiso->status = iso->status; uiso->actual_length = iso->actual_length; } } /* must free buffer */ struct usbip_iso_packet_descriptor* usbip_alloc_iso_desc_pdu(struct urb *urb, ssize_t *bufflen) { struct usbip_iso_packet_descriptor *iso; int np = urb->number_of_packets; ssize_t size = np * sizeof(*iso); int i; iso = kzalloc(size, GFP_KERNEL); if (!iso) return NULL; for (i = 0; i < np; i++) { usbip_pack_iso(&iso[i], &urb->iso_frame_desc[i], 1); usbip_iso_packet_correct_endian(&iso[i], 1); } *bufflen = size; return iso; } EXPORT_SYMBOL_GPL(usbip_alloc_iso_desc_pdu); /* some members of urb must be substituted before. */ int usbip_recv_iso(struct usbip_device *ud, struct urb *urb) { void *buff; struct usbip_iso_packet_descriptor *iso; int np = urb->number_of_packets; int size = np * sizeof(*iso); int i; int ret; int total_length = 0; if (!usb_pipeisoc(urb->pipe)) return 0; /* my Bluetooth dongle gets ISO URBs which are np = 0 */ if (np == 0) return 0; buff = kzalloc(size, GFP_KERNEL); if (!buff) return -ENOMEM; ret = usbip_recv(ud->tcp_socket, buff, size); if (ret != size) { dev_err(&urb->dev->dev, "recv iso_frame_descriptor, %d\n", ret); kfree(buff); if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); else usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return -EPIPE; } iso = (struct usbip_iso_packet_descriptor *) buff; for (i = 0; i < np; i++) { usbip_iso_packet_correct_endian(&iso[i], 0); usbip_pack_iso(&iso[i], &urb->iso_frame_desc[i], 0); total_length += urb->iso_frame_desc[i].actual_length; } kfree(buff); if (total_length != urb->actual_length) { dev_err(&urb->dev->dev, "total length of iso packets %d not equal to actual length of buffer %d\n", total_length, urb->actual_length); if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); else usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return -EPIPE; } return ret; } EXPORT_SYMBOL_GPL(usbip_recv_iso); /* * This functions restores the padding which was removed for optimizing * the bandwidth during transfer over tcp/ip * * buffer and iso packets need to be stored and be in propeper endian in urb * before calling this function */ void usbip_pad_iso(struct usbip_device *ud, struct urb *urb) { int np = urb->number_of_packets; int i; int actualoffset = urb->actual_length; if (!usb_pipeisoc(urb->pipe)) return; /* if no packets or length of data is 0, then nothing to unpack */ if (np == 0 || urb->actual_length == 0) return; /* * if actual_length is transfer_buffer_length then no padding is * present. */ if (urb->actual_length == urb->transfer_buffer_length) return; /* * loop over all packets from last to first (to prevent overwriting * memory when padding) and move them into the proper place */ for (i = np-1; i > 0; i--) { actualoffset -= urb->iso_frame_desc[i].actual_length; memmove(urb->transfer_buffer + urb->iso_frame_desc[i].offset, urb->transfer_buffer + actualoffset, urb->iso_frame_desc[i].actual_length); } } EXPORT_SYMBOL_GPL(usbip_pad_iso); /* some members of urb must be substituted before. */ int usbip_recv_xbuff(struct usbip_device *ud, struct urb *urb) { struct scatterlist *sg; int ret = 0; int recv; int size; int copy; int i; if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) { /* the direction of urb must be OUT. */ if (usb_pipein(urb->pipe)) return 0; size = urb->transfer_buffer_length; } else { /* the direction of urb must be IN. */ if (usb_pipeout(urb->pipe)) return 0; size = urb->actual_length; } /* no need to recv xbuff */ if (!(size > 0)) return 0; if (size > urb->transfer_buffer_length) /* should not happen, probably malicious packet */ goto error; if (urb->num_sgs) { copy = size; for_each_sg(urb->sg, sg, urb->num_sgs, i) { int recv_size; if (copy < sg->length) recv_size = copy; else recv_size = sg->length; recv = usbip_recv(ud->tcp_socket, sg_virt(sg), recv_size); if (recv != recv_size) goto error; copy -= recv; ret += recv; if (!copy) break; } if (ret != size) goto error; } else { ret = usbip_recv(ud->tcp_socket, urb->transfer_buffer, size); if (ret != size) goto error; } return ret; error: dev_err(&urb->dev->dev, "recv xbuf, %d\n", ret); if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); else usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return -EPIPE; } EXPORT_SYMBOL_GPL(usbip_recv_xbuff); static int __init usbip_core_init(void) { return usbip_init_eh(); } static void __exit usbip_core_exit(void) { usbip_finish_eh(); return; } module_init(usbip_core_init); module_exit(usbip_core_exit); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
4 4 4 10 9 10 11 11 10 4 11 11 11 11 11 11 11 11 11 11 11 10 10 10 11 4 4 4 4 4 4 4 4 4 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/igmp.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <net/ip.h> #include <net/netlink.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/addrconf.h> #endif #include "br_private.h" static bool br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { *timer = br_timer_value(&pmctx->ip4_mc_router_timer); return !hlist_unhashed(&pmctx->ip4_rlist); } static bool br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { #if IS_ENABLED(CONFIG_IPV6) *timer = br_timer_value(&pmctx->ip6_mc_router_timer); return !hlist_unhashed(&pmctx->ip6_rlist); #else *timer = 0; return false; #endif } static size_t __br_rports_one_size(void) { return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */ nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */ nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */ } size_t br_rports_size(const struct net_bridge_mcast *brmctx) { struct net_bridge_mcast_port *pmctx; size_t size = nla_total_size(0); /* MDBA_ROUTER */ rcu_read_lock(); hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) size += __br_rports_one_size(); #if IS_ENABLED(CONFIG_IPV6) hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) size += __br_rports_one_size(); #endif rcu_read_unlock(); return size; } int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx) { u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0; bool have_ip4_mc_rtr, have_ip6_mc_rtr; unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; struct net_bridge_port *p; if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; list_for_each_entry_rcu(p, &brmctx->br->port_list, list) { struct net_bridge_mcast_port *pmctx; if (vid) { struct net_bridge_vlan *v; v = br_vlan_find(nbp_vlan_group(p), vid); if (!v) continue; pmctx = &v->port_mcast_ctx; } else { pmctx = &p->multicast_ctx; } have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer); have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer); if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, p->multicast_ctx.multicast_router) || (have_ip4_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, ip4_timer)) || (have_ip6_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, ip6_timer)) || (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); return 0; fail: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) { e->state = flags & MDB_PG_FLAGS_PERMANENT; e->flags = 0; if (flags & MDB_PG_FLAGS_OFFLOAD) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; if (flags & MDB_PG_FLAGS_STAR_EXCL) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; switch (ip->proto) { case htons(ETH_P_IP): ip->dst.ip4 = entry->addr.u.ip4; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip->dst.ip6 = entry->addr.u.ip6; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif default: ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } static int __mdb_fill_srcs(struct sk_buff *skb, struct net_bridge_port_group *p) { struct net_bridge_group_src *ent; struct nlattr *nest, *nest_ent; if (hlist_empty(&p->src_list)) return 0; nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); if (!nest) return -EMSGSIZE; hlist_for_each_entry_rcu(ent, &p->src_list, node, lockdep_is_held(&p->key.port->br->multicast_lock)) { nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); if (!nest_ent) goto out_cancel_err; switch (ent->addr.proto) { case htons(ETH_P_IP): if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, ent->addr.src.ip4)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, &ent->addr.src.ip6)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #endif default: nla_nest_cancel(skb, nest_ent); continue; } if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, br_timer_value(&ent->timer))) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest); return 0; out_cancel_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; u8 flags = 0; int ifindex; memset(&e, 0, sizeof(e)); if (p) { ifindex = p->key.port->dev->ifindex; mtimer = &p->timer; flags = p->flags; } else { ifindex = mp->br->dev->ifindex; mtimer = &mp->timer; } __mdb_entry_fill_flags(&e, flags); e.ifindex = ifindex; e.vid = mp->addr.vid; if (mp->addr.proto == htons(ETH_P_IP)) { e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) } else if (mp->addr.proto == htons(ETH_P_IPV6)) { e.addr.u.ip6 = mp->addr.dst.ip6; #endif } else { ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.state = MDB_PERMANENT; } e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); if (!nest_ent) return -EMSGSIZE; if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, br_timer_value(mtimer))) goto nest_err; switch (mp->addr.proto) { case htons(ETH_P_IP): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) goto nest_err; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) goto nest_err; break; } break; #endif default: ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) goto nest_err; if (dump_srcs_mode && (__mdb_fill_srcs(skb, p) || nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) goto nest_err; } nla_nest_end(skb, nest_ent); return 0; nest_err: nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; if (idx < s_idx) goto skip; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!nest2) { err = -EMSGSIZE; break; } if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); break; } } for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { if (!p->key.port) continue; if (pidx < s_pidx) goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { nla_nest_end(skb, nest2); goto out; } skip_pg: pidx++; } pidx = 0; s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; } out: cb->args[1] = idx; cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_GETMDB, sizeof(*bpm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->ifindex = dev->ifindex; rcu_read_lock(); err = br_mdb_fill_info(skb, cb, dev); if (err) goto out; err = br_rports_fill_info(skb, &br->multicast_ctx); if (err) goto out; out: rcu_read_unlock(); nlmsg_end(skb, nlh); return err; } static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct nlmsghdr *nlh; struct br_port_msg *bpm; struct nlattr *nest, *nest2; nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) goto cancel; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (nest2 == NULL) goto end; if (__mdb_fill_info(skb, mp, pg)) goto end; nla_nest_end(skb, nest2); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t rtnl_mdb_nlmsg_pg_size(const struct net_bridge_port_group *pg) { struct net_bridge_group_src *ent; size_t nlmsg_size, addr_size = 0; /* MDBA_MDB_ENTRY_INFO */ nlmsg_size = nla_total_size(sizeof(struct br_mdb_entry)) + /* MDBA_MDB_EATTR_TIMER */ nla_total_size(sizeof(u32)); if (!pg) goto out; /* MDBA_MDB_EATTR_RTPROT */ nlmsg_size += nla_total_size(sizeof(u8)); switch (pg->key.addr.proto) { case htons(ETH_P_IP): /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; #endif } /* MDBA_MDB_EATTR_GROUP_MODE */ nlmsg_size += nla_total_size(sizeof(u8)); /* MDBA_MDB_EATTR_SRC_LIST nested attr */ if (!hlist_empty(&pg->src_list)) nlmsg_size += nla_total_size(0); hlist_for_each_entry(ent, &pg->src_list, node) { /* MDBA_MDB_SRCLIST_ENTRY nested attr + * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER */ nlmsg_size += nla_total_size(0) + nla_total_size(addr_size) + nla_total_size(sizeof(u32)); } out: return nlmsg_size; } static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0) + /* Port group entry */ rtnl_mdb_nlmsg_pg_size(pg); } void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (!nest) goto cancel; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto end; if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { nla_nest_cancel(skb, port_nest); goto end; } if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { nla_nest_cancel(skb, port_nest); goto end; } nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + nla_total_size(sizeof(__u32)) + nla_total_size(sizeof(u16)); } void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; u16 vid; ifindex = pmctx ? pmctx->port->dev->ifindex : 0; vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static const struct nla_policy br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static const struct nla_policy br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol), }; static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, MCAST_INCLUDE), [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), }; static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, struct netlink_ext_ack *extack) { switch (proto) { case htons(ETH_P_IP): if (nla_len(attr) != sizeof(struct in_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); return false; } if (ipv4_is_multicast(nla_get_in_addr(attr))) { NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); return false; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr src; if (nla_len(attr) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); return false; } src = nla_get_in6_addr(attr); if (ipv6_addr_is_multicast(&src)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); return false; } break; } #endif default: NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); return false; } return true; } static struct net_bridge_mcast * __br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, struct netlink_ext_ack *extack) { struct net_bridge_mcast *brmctx = NULL; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { brmctx = &br->multicast_ctx; goto out; } if (!entry->vid) { NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); goto out; } v = br_vlan_find(br_vlan_group(br), entry->vid); if (!v) { NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); goto out; } if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); goto out; } brmctx = &v->br_mcast_ctx; out: return brmctx; } static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags) { unsigned long now = jiffies; pg->flags = flags; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else del_timer(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); return 0; } static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_sg(cfg, mp, p, brmctx, flags); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, MCAST_INCLUDE, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for * proper replication. */ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) { struct net_bridge_mdb_entry *star_mp; struct br_ip star_group; star_group = p->key.addr; memset(&star_group.src, 0, sizeof(star_group.src)); star_mp = br_mdb_ip_get(cfg->br, &star_group); if (star_mp) br_multicast_sg_add_exclude_ports(star_mp, p); } return 0; } static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, struct br_ip *src_ip, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *sgmp; struct br_mdb_config sg_cfg; struct br_ip sg_ip; u8 flags = 0; sg_ip = cfg->group; sg_ip.src = src_ip->src; sgmp = br_multicast_new_group(cfg->br, &sg_ip); if (IS_ERR(sgmp)) { NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry"); return PTR_ERR(sgmp); } if (cfg->entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (cfg->filter_mode == MCAST_EXCLUDE) flags |= MDB_PG_FLAGS_BLOCKED; memset(&sg_cfg, 0, sizeof(sg_cfg)); sg_cfg.br = cfg->br; sg_cfg.p = cfg->p; sg_cfg.entry = cfg->entry; sg_cfg.group = sg_ip; sg_cfg.src_entry = true; sg_cfg.filter_mode = MCAST_INCLUDE; sg_cfg.rt_protocol = cfg->rt_protocol; sg_cfg.nlflags = cfg->nlflags; return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); } static int br_mdb_add_group_src(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct br_mdb_src_entry *src, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; unsigned long now = jiffies; int err; ent = br_multicast_find_group_src(pg, &src->addr); if (!ent) { ent = br_multicast_new_group_src(pg, &src->addr); if (!ent) { NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry"); return -ENOSPC; } } else if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); return -EEXIST; } if (cfg->filter_mode == MCAST_INCLUDE && cfg->entry->state == MDB_TEMPORARY) mod_timer(&ent->timer, now + br_multicast_gmi(brmctx)); else del_timer(&ent->timer); /* Install a (S, G) forwarding entry for the source. */ err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack); if (err) goto err_del_sg; ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED; return 0; err_del_sg: __br_multicast_del_group_src(ent); return err; } static void br_mdb_del_group_src(struct net_bridge_port_group *pg, struct br_mdb_src_entry *src) { struct net_bridge_group_src *ent; ent = br_multicast_find_group_src(pg, &src->addr); if (WARN_ON_ONCE(!ent)) return; br_multicast_del_group_src(ent, false); } static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { int i, err; for (i = 0; i < cfg->num_src_entries; i++) { err = br_mdb_add_group_src(cfg, pg, brmctx, &cfg->src_entries[i], extack); if (err) goto err_del_group_srcs; } return 0; err_del_group_srcs: for (i--; i >= 0; i--) br_mdb_del_group_src(pg, &cfg->src_entries[i]); return err; } static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; struct hlist_node *tmp; int err; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack); if (err) goto err_clear_delete; hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_DELETE) br_multicast_del_group_src(ent, false); } return 0; err_clear_delete: hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_DELETE; return err; } static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { unsigned long now = jiffies; int err; err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack); if (err) return err; pg->flags = flags; pg->filter_mode = cfg->filter_mode; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else del_timer(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) br_multicast_star_g_handle_mode(pg, cfg->filter_mode); return 0; } static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; int err; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_star_g(cfg, mp, p, brmctx, flags, extack); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, cfg->filter_mode, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; err = br_mdb_add_group_srcs(cfg, p, brmctx, extack); if (err) goto err_del_port_group; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* If we are adding a new EXCLUDE port group (*, G), it needs to be * also added to all (S, G) entries for proper replication. */ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) && cfg->filter_mode == MCAST_EXCLUDE) br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); return 0; err_del_port_group: br_multicast_del_port_group(p); return err; } static int br_mdb_add_group(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge_port *port = cfg->p; struct net_bridge_mdb_entry *mp; struct net_bridge *br = cfg->br; struct net_bridge_mcast *brmctx; struct br_ip group = cfg->group; unsigned char flags = 0; brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) return -EINVAL; mp = br_multicast_new_group(br, &group); if (IS_ERR(mp)) return PTR_ERR(mp); /* host join */ if (!port) { if (mp->host_joined) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; } if (entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (br_multicast_is_star_g(&group)) return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack); else return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack); } static int __br_mdb_add(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { int ret; spin_lock_bh(&cfg->br->multicast_lock); ret = br_mdb_add_group(cfg, extack); spin_unlock_bh(&cfg->br->multicast_lock); return ret; } static int br_mdb_config_src_entry_init(struct nlattr *src_entry, struct br_mdb_src_entry *src, __be16 proto, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; int err; err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, br_mdbe_src_list_entry_pol, extack); if (err) return err; if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) return -EINVAL; if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) return -EINVAL; src->addr.proto = proto; nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS], nla_len(tb[MDBE_SRCATTR_ADDRESS])); return 0; } static int br_mdb_config_src_list_init(struct nlattr *src_list, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *src_entry; int rem, err; int i = 0; nla_for_each_nested(src_entry, src_list, rem) cfg->num_src_entries++; if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) { NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)", PG_SRC_ENT_LIMIT - 1); return -EINVAL; } cfg->src_entries = kcalloc(cfg->num_src_entries, sizeof(struct br_mdb_src_entry), GFP_KERNEL); if (!cfg->src_entries) return -ENOMEM; nla_for_each_nested(src_entry, src_list, rem) { err = br_mdb_config_src_entry_init(src_entry, &cfg->src_entries[i], cfg->entry->addr.proto, extack); if (err) goto err_src_entry_init; i++; } return 0; err_src_entry_init: kfree(cfg->src_entries); return err; } static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg) { kfree(cfg->src_entries); } static int br_mdb_config_attrs_init(struct nlattr *set_attrs, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; int err; err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs, br_mdbe_attrs_pol, extack); if (err) return err; if (mdb_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], cfg->entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); return -EINVAL; } cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]); } else { cfg->filter_mode = MCAST_EXCLUDE; } if (mdb_attrs[MDBE_ATTR_SRC_LIST]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); return -EINVAL; } if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); return -EINVAL; } err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST], cfg, extack); if (err) return err; } if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) { NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); return -EINVAL; } if (mdb_attrs[MDBE_ATTR_RTPROT]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups"); return -EINVAL; } cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]); } return 0; } static int br_mdb_config_init(struct br_mdb_config *cfg, struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); memset(cfg, 0, sizeof(*cfg)); cfg->filter_mode = MCAST_EXCLUDE; cfg->rt_protocol = RTPROT_STATIC; cfg->nlflags = nlmsg_flags; cfg->br = netdev_priv(dev); if (!netif_running(cfg->br->dev)) { NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); return -EINVAL; } if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); return -EINVAL; } cfg->entry = nla_data(tb[MDBA_SET_ENTRY]); if (cfg->entry->ifindex != cfg->br->dev->ifindex) { struct net_device *pdev; pdev = __dev_get_by_index(net, cfg->entry->ifindex); if (!pdev) { NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); return -ENODEV; } cfg->p = br_port_get_rtnl(pdev); if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); return -EINVAL; } if (cfg->p->br != cfg->br) { NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; } } if (cfg->entry->addr.proto == htons(ETH_P_IP) && ipv4_is_zeronet(cfg->entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address 0.0.0.0 is not allowed"); return -EINVAL; } if (tb[MDBA_SET_ENTRY_ATTRS]) return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg, extack); else __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL); return 0; } static void br_mdb_config_fini(struct br_mdb_config *cfg) { br_mdb_config_src_list_fini(cfg); } int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack); if (err) return err; err = -EINVAL; /* host join errors which can happen before creating the group */ if (!cfg.p && !br_group_is_l2(&cfg.group)) { /* don't allow any flags for host-joined IP groups */ if (cfg.entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); goto out; } if (!br_multicast_is_star_g(&cfg.group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); goto out; } } if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); goto out; } if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); goto out; } vg = nbp_vlan_group(cfg.p); } else { vg = br_vlan_group(cfg.br); } /* If vlan filtering is enabled and VLAN is not specified * install mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_add(&cfg, extack); if (err) break; } } else { err = __br_mdb_add(&cfg, extack); } out: br_mdb_config_fini(&cfg); return err; } static int __br_mdb_del(const struct br_mdb_config *cfg) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge *br = cfg->br; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip ip = cfg->group; int err = -EINVAL; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; /* host leave */ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) { br_multicast_host_leave(mp, false); err = 0; br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); goto unlock; } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex) continue; br_multicast_del_pg(mp, p, pp); err = 0; break; } unlock: spin_unlock_bh(&br->multicast_lock); return err; } int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, 0, extack); if (err) return err; if (cfg.p) vg = nbp_vlan_group(cfg.p); else vg = br_vlan_group(cfg.br); /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_del(&cfg); } } else { err = __br_mdb_del(&cfg); } br_mdb_config_fini(&cfg); return err; } struct br_mdb_flush_desc { u32 port_ifindex; u16 vid; u8 rt_protocol; u8 state; u8 state_mask; }; static const struct nla_policy br_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT), }; static int br_mdb_flush_desc_init(struct br_mdb_flush_desc *desc, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; desc->port_ifindex = entry->ifindex; desc->vid = entry->vid; desc->state = entry->state; if (!tb[MDBA_SET_ENTRY_ATTRS]) return 0; err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_SET_ENTRY_ATTRS], br_mdbe_attrs_del_bulk_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_STATE_MASK]) desc->state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]); if (mdbe_attrs[MDBE_ATTR_RTPROT]) desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]); return 0; } static void br_mdb_flush_host(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { u8 state; if (desc->port_ifindex && desc->port_ifindex != br->dev->ifindex) return; if (desc->rt_protocol) return; state = br_group_is_l2(&mp->addr) ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) return; br_multicast_host_leave(mp, true); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } static void br_mdb_flush_pgs(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;) { u8 state; if (desc->port_ifindex && desc->port_ifindex != p->key.port->dev->ifindex) { pp = &p->next; continue; } if (desc->rt_protocol && desc->rt_protocol != p->rt_protocol) { pp = &p->next; continue; } state = p->flags & MDB_PG_FLAGS_PERMANENT ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) { pp = &p->next; continue; } br_multicast_del_pg(mp, p, pp); } } static void br_mdb_flush(struct net_bridge *br, const struct br_mdb_flush_desc *desc) { struct net_bridge_mdb_entry *mp; spin_lock_bh(&br->multicast_lock); /* Safe variant is not needed because entries are removed from the list * upon group timer expiration or bridge deletion. */ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { if (desc->vid && desc->vid != mp->addr.vid) continue; br_mdb_flush_host(br, mp, desc); br_mdb_flush_pgs(br, mp, desc); } spin_unlock_bh(&br->multicast_lock); } int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct br_mdb_flush_desc desc = {}; int err; err = br_mdb_flush_desc_init(&desc, tb, extack); if (err) return err; br_mdb_flush(br, &desc); return 0; } static const struct nla_policy br_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static int br_mdb_get_parse(struct net_device *dev, struct nlattr *tb[], struct br_ip *group, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_GET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; if (!tb[MDBA_GET_ENTRY_ATTRS]) { __mdb_entry_to_br_ip(entry, group, NULL); return 0; } err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_GET_ENTRY_ATTRS], br_mdbe_attrs_get_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdbe_attrs[MDBE_ATTR_SOURCE], entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(entry, group, mdbe_attrs); return 0; } static struct sk_buff * br_mdb_get_reply_alloc(const struct net_bridge_mdb_entry *mp) { struct net_bridge_port_group *pg; size_t nlmsg_size; nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0); if (mp->host_joined) nlmsg_size += rtnl_mdb_nlmsg_pg_size(NULL); for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) nlmsg_size += rtnl_mdb_nlmsg_pg_size(pg); return nlmsg_new(nlmsg_size, GFP_ATOMIC); } static int br_mdb_get_reply_fill(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, u32 portid, u32 seq) { struct nlattr *mdb_nest, *mdb_entry_nest; struct net_bridge_port_group *pg; struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, portid, seq, RTM_NEWMDB, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = mp->br->dev->ifindex; mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB); if (!mdb_nest) { err = -EMSGSIZE; goto cancel; } mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!mdb_entry_nest) { err = -EMSGSIZE; goto cancel; } if (mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) goto cancel; } for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) { err = __mdb_fill_info(skb, mp, pg); if (err) goto cancel; } nla_nest_end(skb, mdb_entry_nest); nla_nest_end(skb, mdb_nest); nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return err; } int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct sk_buff *skb; struct br_ip group; int err; err = br_mdb_get_parse(dev, tb, &group, extack); if (err) return err; /* Hold the multicast lock to ensure that the MDB entry does not change * between the time the reply size is determined and when the reply is * filled in. */ spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &group); if (!mp || (!mp->ports && !mp->host_joined)) { NL_SET_ERR_MSG_MOD(extack, "MDB entry not found"); err = -ENOENT; goto unlock; } skb = br_mdb_get_reply_alloc(mp); if (!skb) { err = -ENOMEM; goto unlock; } err = br_mdb_get_reply_fill(skb, mp, portid, seq); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to fill MDB get reply"); goto free; } spin_unlock_bh(&br->multicast_lock); return rtnl_unicast(skb, dev_net(dev), portid); free: kfree_skb(skb); unlock: spin_unlock_bh(&br->multicast_lock); return err; }
1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0-or-later /* ASN.1 Object identifier (OID) registry * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/export.h> #include <linux/oid_registry.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/bug.h> #include <linux/asn1.h> #include "oid_registry_data.c" MODULE_DESCRIPTION("OID Registry"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); /** * look_up_OID - Find an OID registration for the specified data * @data: Binary representation of the OID * @datasize: Size of the binary representation */ enum OID look_up_OID(const void *data, size_t datasize) { const unsigned char *octets = data; enum OID oid; unsigned char xhash; unsigned i, j, k, hash; size_t len; /* Hash the OID data */ hash = datasize - 1; for (i = 0; i < datasize; i++) hash += octets[i] * 33; hash = (hash >> 24) ^ (hash >> 16) ^ (hash >> 8) ^ hash; hash &= 0xff; /* Binary search the OID registry. OIDs are stored in ascending order * of hash value then ascending order of size and then in ascending * order of reverse value. */ i = 0; k = OID__NR; while (i < k) { j = (i + k) / 2; xhash = oid_search_table[j].hash; if (xhash > hash) { k = j; continue; } if (xhash < hash) { i = j + 1; continue; } oid = oid_search_table[j].oid; len = oid_index[oid + 1] - oid_index[oid]; if (len > datasize) { k = j; continue; } if (len < datasize) { i = j + 1; continue; } /* Variation is most likely to be at the tail end of the * OID, so do the comparison in reverse. */ while (len > 0) { unsigned char a = oid_data[oid_index[oid] + --len]; unsigned char b = octets[len]; if (a > b) { k = j; goto next; } if (a < b) { i = j + 1; goto next; } } return oid; next: ; } return OID__NR; } EXPORT_SYMBOL_GPL(look_up_OID); /** * parse_OID - Parse an OID from a bytestream * @data: Binary representation of the header + OID * @datasize: Size of the binary representation * @oid: Pointer to oid to return result * * Parse an OID from a bytestream that holds the OID in the format * ASN1_OID | length | oid. The length indicator must equal to datasize - 2. * -EBADMSG is returned if the bytestream is too short. */ int parse_OID(const void *data, size_t datasize, enum OID *oid) { const unsigned char *v = data; /* we need 2 bytes of header and at least 1 byte for oid */ if (datasize < 3 || v[0] != ASN1_OID || v[1] != datasize - 2) return -EBADMSG; *oid = look_up_OID(data + 2, datasize - 2); return 0; } EXPORT_SYMBOL_GPL(parse_OID); /* * sprint_OID - Print an Object Identifier into a buffer * @data: The encoded OID to print * @datasize: The size of the encoded OID * @buffer: The buffer to render into * @bufsize: The size of the buffer * * The OID is rendered into the buffer in "a.b.c.d" format and the number of * bytes is returned. -EBADMSG is returned if the data could not be interpreted * and -ENOBUFS if the buffer was too small. */ int sprint_oid(const void *data, size_t datasize, char *buffer, size_t bufsize) { const unsigned char *v = data, *end = v + datasize; unsigned long num; unsigned char n; size_t ret; int count; if (v >= end) goto bad; n = *v++; ret = count = snprintf(buffer, bufsize, "%u.%u", n / 40, n % 40); if (count >= bufsize) return -ENOBUFS; buffer += count; bufsize -= count; while (v < end) { n = *v++; if (!(n & 0x80)) { num = n; } else { num = n & 0x7f; do { if (v >= end) goto bad; n = *v++; num <<= 7; num |= n & 0x7f; } while (n & 0x80); } ret += count = snprintf(buffer, bufsize, ".%lu", num); if (count >= bufsize) return -ENOBUFS; buffer += count; bufsize -= count; } return ret; bad: snprintf(buffer, bufsize, "(bad)"); return -EBADMSG; } EXPORT_SYMBOL_GPL(sprint_oid); /** * sprint_OID - Print an Object Identifier into a buffer * @oid: The OID to print * @buffer: The buffer to render into * @bufsize: The size of the buffer * * The OID is rendered into the buffer in "a.b.c.d" format and the number of * bytes is returned. */ int sprint_OID(enum OID oid, char *buffer, size_t bufsize) { int ret; BUG_ON(oid >= OID__NR); ret = sprint_oid(oid_data + oid_index[oid], oid_index[oid + 1] - oid_index[oid], buffer, bufsize); BUG_ON(ret == -EBADMSG); return ret; } EXPORT_SYMBOL_GPL(sprint_OID);
392 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 /* SPDX-License-Identifier: GPL-2.0 */ /* IP Virtual Server * data structure and functionality definitions */ #ifndef _NET_IP_VS_H #define _NET_IP_VS_H #include <linux/ip_vs.h> /* definitions shared with userland */ #include <asm/types.h> /* for __uXX types */ #include <linux/list.h> /* for struct list_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ #include <linux/atomic.h> /* for struct atomic_t */ #include <linux/refcount.h> /* for struct refcount_t */ #include <linux/workqueue.h> #include <linux/compiler.h> #include <linux/timer.h> #include <linux/bug.h> #include <net/checksum.h> #include <linux/netfilter.h> /* for union nf_inet_addr */ #include <linux/ip.h> #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/net_namespace.h> /* Netw namespace */ #include <linux/sched/isolation.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) { return net->ipvs; } /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; extern struct mutex __ip_vs_mutex; struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ __u32 len; /* IPv4 simply where L4 starts * IPv6 where L4 Transport Header starts */ __u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/ __s16 protocol; __s32 flags; union nf_inet_addr saddr; union nf_inet_addr daddr; }; static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, int len, void *buffer) { return skb_header_pointer(skb, offset, len, buffer); } /* This function handles filling *ip_vs_iphdr, both for IPv4 and IPv6. * IPv6 requires some extra work, as finding proper header position, * depend on the IPv6 extension headers. */ static inline int ip_vs_fill_iph_skb_off(int af, const struct sk_buff *skb, int offset, int hdr_flags, struct ip_vs_iphdr *iphdr) { iphdr->hdr_flags = hdr_flags; iphdr->off = offset; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { struct ipv6hdr _iph; const struct ipv6hdr *iph = skb_header_pointer( skb, offset, sizeof(_iph), &_iph); if (!iph) return 0; iphdr->saddr.in6 = iph->saddr; iphdr->daddr.in6 = iph->daddr; /* ipv6_find_hdr() updates len, flags */ iphdr->len = offset; iphdr->flags = 0; iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1, &iphdr->fragoffs, &iphdr->flags); if (iphdr->protocol < 0) return 0; } else #endif { struct iphdr _iph; const struct iphdr *iph = skb_header_pointer( skb, offset, sizeof(_iph), &_iph); if (!iph) return 0; iphdr->len = offset + iph->ihl * 4; iphdr->fragoffs = 0; iphdr->protocol = iph->protocol; iphdr->saddr.ip = iph->saddr; iphdr->daddr.ip = iph->daddr; } return 1; } static inline int ip_vs_fill_iph_skb_icmp(int af, const struct sk_buff *skb, int offset, bool inverse, struct ip_vs_iphdr *iphdr) { int hdr_flags = IP_VS_HDR_ICMP; if (inverse) hdr_flags |= IP_VS_HDR_INVERSE; return ip_vs_fill_iph_skb_off(af, skb, offset, hdr_flags, iphdr); } static inline int ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, bool inverse, struct ip_vs_iphdr *iphdr) { int hdr_flags = 0; if (inverse) hdr_flags |= IP_VS_HDR_INVERSE; return ip_vs_fill_iph_skb_off(af, skb, skb_network_offset(skb), hdr_flags, iphdr); } static inline bool ip_vs_iph_inverse(const struct ip_vs_iphdr *iph) { return !!(iph->hdr_flags & IP_VS_HDR_INVERSE); } static inline bool ip_vs_iph_icmp(const struct ip_vs_iphdr *iph) { return !!(iph->hdr_flags & IP_VS_HDR_ICMP); } static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst, const union nf_inet_addr *src) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) dst->in6 = src->in6; else #endif dst->ip = src->ip; } static inline void ip_vs_addr_set(int af, union nf_inet_addr *dst, const union nf_inet_addr *src) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { dst->in6 = src->in6; return; } #endif dst->ip = src->ip; dst->all[1] = 0; dst->all[2] = 0; dst->all[3] = 0; } static inline int ip_vs_addr_equal(int af, const union nf_inet_addr *a, const union nf_inet_addr *b) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) return ipv6_addr_equal(&a->in6, &b->in6); #endif return a->ip == b->ip; } #ifdef CONFIG_IP_VS_DEBUG #include <linux/net.h> int ip_vs_get_debug_level(void); static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, const union nf_inet_addr *addr, int *idx) { int len; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) len = snprintf(&buf[*idx], buf_len - *idx, "[%pI6c]", &addr->in6) + 1; else #endif len = snprintf(&buf[*idx], buf_len - *idx, "%pI4", &addr->ip) + 1; *idx += len; BUG_ON(*idx > buf_len + 1); return &buf[*idx - len]; } #define IP_VS_DBG_BUF(level, msg, ...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \ if (level <= ip_vs_get_debug_level()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_ERR_BUF(msg...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \ pr_err(msg); \ } while (0) /* Only use from within IP_VS_DBG_BUF() or IP_VS_ERR_BUF macros */ #define IP_VS_DBG_ADDR(af, addr) \ ip_vs_dbg_addr(af, ip_vs_dbg_buf, \ sizeof(ip_vs_dbg_buf), addr, \ &ip_vs_dbg_idx) #define IP_VS_DBG(level, msg, ...) \ do { \ if (level <= ip_vs_get_debug_level()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_DBG_RL(msg, ...) \ do { \ if (net_ratelimit()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level()) \ pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level() && \ net_ratelimit()) \ pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #else /* NO DEBUGGING at ALL */ #define IP_VS_DBG_BUF(level, msg...) do {} while (0) #define IP_VS_ERR_BUF(msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG_RL(msg...) do {} while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #endif #define IP_VS_BUG() BUG() #define IP_VS_ERR_RL(msg, ...) \ do { \ if (net_ratelimit()) \ pr_err(msg, ##__VA_ARGS__); \ } while (0) /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) /* TCP State Values */ enum { IP_VS_TCP_S_NONE = 0, IP_VS_TCP_S_ESTABLISHED, IP_VS_TCP_S_SYN_SENT, IP_VS_TCP_S_SYN_RECV, IP_VS_TCP_S_FIN_WAIT, IP_VS_TCP_S_TIME_WAIT, IP_VS_TCP_S_CLOSE, IP_VS_TCP_S_CLOSE_WAIT, IP_VS_TCP_S_LAST_ACK, IP_VS_TCP_S_LISTEN, IP_VS_TCP_S_SYNACK, IP_VS_TCP_S_LAST }; /* UDP State Values */ enum { IP_VS_UDP_S_NORMAL, IP_VS_UDP_S_LAST, }; /* ICMP State Values */ enum { IP_VS_ICMP_S_NORMAL, IP_VS_ICMP_S_LAST, }; /* SCTP State Values */ enum ip_vs_sctp_states { IP_VS_SCTP_S_NONE, IP_VS_SCTP_S_INIT1, IP_VS_SCTP_S_INIT, IP_VS_SCTP_S_COOKIE_SENT, IP_VS_SCTP_S_COOKIE_REPLIED, IP_VS_SCTP_S_COOKIE_WAIT, IP_VS_SCTP_S_COOKIE, IP_VS_SCTP_S_COOKIE_ECHOED, IP_VS_SCTP_S_ESTABLISHED, IP_VS_SCTP_S_SHUTDOWN_SENT, IP_VS_SCTP_S_SHUTDOWN_RECEIVED, IP_VS_SCTP_S_SHUTDOWN_ACK_SENT, IP_VS_SCTP_S_REJECTED, IP_VS_SCTP_S_CLOSED, IP_VS_SCTP_S_LAST }; /* Connection templates use bits from state */ #define IP_VS_CTPL_S_NONE 0x0000 #define IP_VS_CTPL_S_ASSURED 0x0001 #define IP_VS_CTPL_S_LAST 0x0002 /* Delta sequence info structure * Each ip_vs_conn has 2 (output AND input seq. changes). * Only used in the VS/NAT. */ struct ip_vs_seq { __u32 init_seq; /* Add delta from this seq */ __u32 delta; /* Delta in sequence numbers */ __u32 previous_delta; /* Delta in sequence numbers * before last resized pkt */ }; /* counters per cpu */ struct ip_vs_counters { u64_stats_t conns; /* connections scheduled */ u64_stats_t inpkts; /* incoming packets */ u64_stats_t outpkts; /* outgoing packets */ u64_stats_t inbytes; /* incoming bytes */ u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { struct ip_vs_counters cnt; struct u64_stats_sync syncp; }; /* Default nice for estimator kthreads */ #define IPVS_EST_NICE 0 /* IPVS statistics objects */ struct ip_vs_estimator { struct hlist_node list; u64 last_inbytes; u64 last_outbytes; u64 last_conns; u64 last_inpkts; u64 last_outpkts; u64 cps; u64 inpps; u64 outpps; u64 inbps; u64 outbps; s32 ktid:16, /* kthread ID, -1=temp list */ ktrow:8, /* row/tick ID for kthread */ ktcid:8; /* chain ID for kthread tick */ }; /* * IPVS statistics object, 64-bit kernel version of struct ip_vs_stats_user */ struct ip_vs_kstats { u64 conns; /* connections scheduled */ u64 inpkts; /* incoming packets */ u64 outpkts; /* outgoing packets */ u64 inbytes; /* incoming bytes */ u64 outbytes; /* outgoing bytes */ u64 cps; /* current connection rate */ u64 inpps; /* current in packet rate */ u64 outpps; /* current out packet rate */ u64 inbps; /* current in byte rate */ u64 outbps; /* current out byte rate */ }; struct ip_vs_stats { struct ip_vs_kstats kstats; /* kernel statistics */ struct ip_vs_estimator est; /* estimator */ struct ip_vs_cpu_stats __percpu *cpustats; /* per cpu counters */ spinlock_t lock; /* spin lock */ struct ip_vs_kstats kstats0; /* reset values */ }; struct ip_vs_stats_rcu { struct ip_vs_stats s; struct rcu_head rcu_head; }; int ip_vs_stats_init_alloc(struct ip_vs_stats *s); struct ip_vs_stats *ip_vs_stats_alloc(void); void ip_vs_stats_release(struct ip_vs_stats *stats); void ip_vs_stats_free(struct ip_vs_stats *stats); /* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ #define IPVS_EST_NTICKS 50 /* Estimation uses a 2-second period containing ticks (in jiffies) */ #define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) /* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). * Value of 4 and above ensures kthreads will take work without exceeding * the CPU capacity under different circumstances. */ #define IPVS_EST_LOAD_DIVISOR 8 /* Kthreads should not have work that exceeds the CPU load above 50% */ #define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) /* Desired number of chains per timer tick (chain load factor in 100us units), * 48=4.8ms of 40ms tick (12% CPU usage): * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 */ #define IPVS_EST_CHAIN_FACTOR \ ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) /* Compiled number of chains per tick * The defines should match cond_resched_rcu */ #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) #define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR #else #define IPVS_EST_TICK_CHAINS 1 #endif #if IPVS_EST_NTICKS > 127 #error Too many timer ticks for ktrow #endif /* Multiple chains processed in same tick */ struct ip_vs_est_tick_data { struct rcu_head rcu_head; struct hlist_head chains[IPVS_EST_TICK_CHAINS]; DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); int chain_len[IPVS_EST_TICK_CHAINS]; }; /* Context for estimation kthread */ struct ip_vs_est_kt_data { struct netns_ipvs *ipvs; struct task_struct *task; /* task if running */ struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ unsigned long est_timer; /* estimation timer (jiffies) */ struct ip_vs_stats *calc_stats; /* Used for calculation */ int tick_len[IPVS_EST_NTICKS]; /* est count */ int id; /* ktid per netns */ int chain_max; /* max ests per tick chain */ int tick_max; /* max ests per tick */ int est_count; /* attached ests to kthread */ int est_max_count; /* max ests per kthread */ int add_row; /* row for new ests */ int est_row; /* estimated row */ }; struct dst_entry; struct iphdr; struct ip_vs_conn; struct ip_vs_app; struct sk_buff; struct ip_vs_proto_data; struct ip_vs_protocol { struct ip_vs_protocol *next; char *name; u16 protocol; u16 num_states; int dont_defrag; void (*init)(struct ip_vs_protocol *pp); void (*exit)(struct ip_vs_protocol *pp); int (*init_netns)(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd); void (*exit_netns)(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd); int (*conn_schedule)(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph); struct ip_vs_conn * (*conn_in_get)(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); struct ip_vs_conn * (*conn_out_get)(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); int (*snat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph); int (*dnat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph); const char *(*state_name)(int state); void (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd); int (*register_app)(struct netns_ipvs *ipvs, struct ip_vs_app *inc); void (*unregister_app)(struct netns_ipvs *ipvs, struct ip_vs_app *inc); int (*app_conn_bind)(struct ip_vs_conn *cp); void (*debug_packet)(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); void (*timeout_change)(struct ip_vs_proto_data *pd, int flags); }; /* protocol data per netns */ struct ip_vs_proto_data { struct ip_vs_proto_data *next; struct ip_vs_protocol *pp; int *timeout_table; /* protocol timeout table */ atomic_t appcnt; /* counter of proto app incs. */ struct tcp_states_t *tcp_state_table; }; struct ip_vs_protocol *ip_vs_proto_get(unsigned short proto); struct ip_vs_proto_data *ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto); struct ip_vs_conn_param { struct netns_ipvs *ipvs; const union nf_inet_addr *caddr; const union nf_inet_addr *vaddr; __be16 cport; __be16 vport; __u16 protocol; u16 af; const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; }; /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { struct hlist_node c_list; /* hashed list heads */ /* Protocol, addresses and port numbers */ __be16 cport; __be16 dport; __be16 vport; u16 af; /* address family */ union nf_inet_addr caddr; /* client address */ union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ __u16 daf; /* Address family of the dest */ struct netns_ipvs *ipvs; /* counter and timer */ refcount_t refcnt; /* reference count */ struct timer_list timer; /* Expiration timer */ volatile unsigned long timeout; /* timeout */ /* Flags and state transition */ spinlock_t lock; /* lock for state transition */ volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ __u32 fwmark; /* Fire wall mark from skb */ unsigned long sync_endtime; /* jiffies + sent_retries */ /* Control members */ struct ip_vs_conn *control; /* Master control connection */ atomic_t n_control; /* Number of controlled ones */ struct ip_vs_dest *dest; /* real server */ atomic_t in_pkts; /* incoming packet counter */ /* Packet transmitter for different forwarding methods. If it * mangles the packet, it must return NF_DROP or better NF_STOLEN, * otherwise this must be changed to a sk_buff **. * NF_ACCEPT can be returned when destination is local. */ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); /* Note: we can group the following members into a structure, * in order to save more space, and the following members are * only used in VS/NAT anyway */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ ); const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; struct rcu_head rcu_head; }; /* Extended internal versions of struct ip_vs_service_user and ip_vs_dest_user * for IPv6 support. * * We need these to conveniently pass around service and destination * options, but unfortunately, we also need to keep the old definitions to * maintain userspace backwards compatibility for the setsockopt interface. */ struct ip_vs_service_user_kern { /* virtual service addresses */ u16 af; u16 protocol; union nf_inet_addr addr; /* virtual ip address */ __be16 port; u32 fwmark; /* firewall mark of service */ /* virtual service options */ char *sched_name; char *pe_name; unsigned int flags; /* virtual service flags */ unsigned int timeout; /* persistent timeout in sec */ __be32 netmask; /* persistent netmask or plen */ }; struct ip_vs_dest_user_kern { /* destination server address */ union nf_inet_addr addr; __be16 port; /* real server options */ unsigned int conn_flags; /* connection flags */ int weight; /* destination weight */ /* thresholds for active connections */ u32 u_threshold; /* upper threshold */ u32 l_threshold; /* lower threshold */ /* Address family of addr */ u16 af; u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ u16 tun_flags; /* tunnel flags */ }; /* * The information about the virtual service offered to the net and the * forwarding entries. */ struct ip_vs_service { struct hlist_node s_list; /* for normal service table */ struct hlist_node f_list; /* for fwmark-based service table */ atomic_t refcnt; /* reference counter */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ union nf_inet_addr addr; /* IP address for virtual service */ __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ struct netns_ipvs *ipvs; struct list_head destinations; /* real server d-linked list */ __u32 num_dests; /* number of servers */ struct ip_vs_stats stats; /* statistics for the service */ /* for scheduling */ struct ip_vs_scheduler __rcu *scheduler; /* bound scheduler object */ spinlock_t sched_lock; /* lock sched_data */ void *sched_data; /* scheduler application data */ /* alternate persistence engine */ struct ip_vs_pe __rcu *pe; int conntrack_afmask; struct rcu_head rcu_head; }; /* Information for cached dst */ struct ip_vs_dest_dst { struct dst_entry *dst_cache; /* destination cache entry */ u32 dst_cookie; union nf_inet_addr dst_saddr; struct rcu_head rcu_head; }; /* The real server destination forwarding entry with ip address, port number, * and so on. */ struct ip_vs_dest { struct list_head n_list; /* for the dests in the service */ struct hlist_node d_list; /* for table with all the dests */ u16 af; /* address family */ __be16 port; /* port number of the server */ union nf_inet_addr addr; /* IP address of the server */ volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ atomic_t last_weight; /* server latest weight */ __u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ __u16 tun_flags; /* tunnel flags */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ unsigned long idle_start; /* start time, jiffies */ /* connection counters and thresholds */ atomic_t activeconns; /* active connections */ atomic_t inactconns; /* inactive connections */ atomic_t persistconns; /* persistent connections */ __u32 u_threshold; /* upper threshold */ __u32 l_threshold; /* lower threshold */ /* for destination cache */ spinlock_t dst_lock; /* lock of dst_cache */ struct ip_vs_dest_dst __rcu *dest_dst; /* cached dst info */ /* for virtual service */ struct ip_vs_service __rcu *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ __be16 vport; /* virtual port number */ union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; /* The scheduler object */ struct ip_vs_scheduler { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ /* scheduler initializing service */ int (*init_service)(struct ip_vs_service *svc); /* scheduling service finish */ void (*done_service)(struct ip_vs_service *svc); /* dest is linked */ int (*add_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* dest is unlinked */ int (*del_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* dest is updated */ int (*upd_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* selecting a server from the given service */ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph); }; /* The persistence engine object */ struct ip_vs_pe { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ /* get the connection template, if any */ int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport); }; /* The application module object (a.k.a. app incarnation) */ struct ip_vs_app { struct list_head a_list; /* member in app list */ int type; /* IP_VS_APP_TYPE_xxx */ char *name; /* application module name */ __u16 protocol; struct module *module; /* THIS_MODULE/NULL */ struct list_head incs_list; /* list of incarnations */ /* members for application incarnations */ struct list_head p_list; /* member in proto app list */ struct ip_vs_app *app; /* its real application */ __be16 port; /* port number in net order */ atomic_t usecnt; /* usage counter */ struct rcu_head rcu_head; /* output hook: Process packet in inout direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* ip_vs_app finish */ int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* not used now */ int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *, struct ip_vs_protocol *); void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *); int * timeout_table; int * timeouts; int timeouts_size; int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app, int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * (*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, int inverse); struct ip_vs_conn * (*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, int inverse); int (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_app *app); void (*timeout_change)(struct ip_vs_app *app, int flags); }; struct ipvs_master_sync_state { struct list_head sync_queue; struct ip_vs_sync_buff *sync_buff; unsigned long sync_queue_len; unsigned int sync_queue_delay; struct delayed_work master_wakeup_work; struct netns_ipvs *ipvs; }; struct ip_vs_sync_thread_data; /* How much time to keep dests in trash */ #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) struct ipvs_sync_daemon_cfg { union nf_inet_addr mcast_group; int syncid; u16 sync_maxlen; u16 mcast_port; u8 mcast_af; u8 mcast_ttl; /* multicast interface name */ char mcast_ifn[IP_VS_IFNAME_MAXLEN]; }; /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ int enable; /* enable like nf_hooks do */ /* Hash table: for real service lookups */ #define IP_VS_RTAB_BITS 4 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) struct hlist_head rs_table[IP_VS_RTAB_SIZE]; /* ip_vs_app */ struct list_head app_list; /* ip_vs_proto */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; /* ip_vs_proto_tcp */ #ifdef CONFIG_IP_VS_PROTO_TCP #define TCP_APP_TAB_BITS 4 #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) struct list_head tcp_apps[TCP_APP_TAB_SIZE]; #endif /* ip_vs_proto_udp */ #ifdef CONFIG_IP_VS_PROTO_UDP #define UDP_APP_TAB_BITS 4 #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) struct list_head udp_apps[UDP_APP_TAB_SIZE]; #endif /* ip_vs_proto_sctp */ #ifdef CONFIG_IP_VS_PROTO_SCTP #define SCTP_APP_TAB_BITS 4 #define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) #define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) /* Hash table for SCTP application incarnations */ struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ /* Trash for destinations */ struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ /* Service counters */ atomic_t ftpsvc_counter; atomic_t nullsvc_counter; atomic_t conn_out_counter; #ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ struct delayed_work expire_nodest_conn_work; /* 1/rate drop and drop-entry variables */ struct delayed_work defense_work; /* Work handler */ int drop_rate; int drop_counter; int old_secure_tcp; atomic_t dropentry; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ spinlock_t droppacket_lock; /* drop packet handling */ spinlock_t securetcp_lock; /* state and timeout tables */ /* sys-ctl struct */ struct ctl_table_header *sysctl_hdr; struct ctl_table *sysctl_tbl; #endif /* sysctl variables */ int sysctl_amemthresh; int sysctl_am_droprate; int sysctl_drop_entry; int sysctl_drop_packet; int sysctl_secure_tcp; #ifdef CONFIG_IP_VS_NFCT int sysctl_conntrack; #endif int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; int sysctl_sync_persist_mode; unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; int sysctl_sloppy_tcp; int sysctl_sloppy_sctp; int sysctl_expire_quiescent_template; int sysctl_sync_threshold[2]; unsigned int sysctl_sync_refresh_period; int sysctl_sync_retries; int sysctl_nat_icmp_send; int sysctl_pmtu_disc; int sysctl_backup_only; int sysctl_conn_reuse_mode; int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; #ifdef CONFIG_SYSCTL cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ int est_cpulist_valid; /* cpulist set */ int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif /* ip_vs_lblc */ int sysctl_lblc_expiration; struct ctl_table_header *lblc_ctl_header; struct ctl_table *lblc_ctl_table; /* ip_vs_lblcr */ int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ struct hlist_head est_temp_list; /* Ests during calc phase */ struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ unsigned long est_max_threads;/* Hard limit of kthreads */ int est_calc_phase; /* Calculation phase */ int est_chain_max; /* Calculated chain_max */ int est_kt_count; /* Allocated ptrs */ int est_add_ktid; /* ktid where to add ests */ atomic_t est_genid; /* kthreads reload genid */ atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; spinlock_t sync_buff_lock; struct ip_vs_sync_thread_data *master_tinfo; struct ip_vs_sync_thread_data *backup_tinfo; int threads_mask; volatile int sync_state; struct mutex sync_mutex; struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space ptr */ struct net *net; /* Needed by timer routines */ /* Number of heterogeneous destinations, needed because heterogeneous * are not supported when synchronization is enabled. */ unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ }; #define DEFAULT_SYNC_THRESHOLD 3 #define DEFAULT_SYNC_PERIOD 50 #define DEFAULT_SYNC_VER 1 #define DEFAULT_SLOPPY_TCP 0 #define DEFAULT_SLOPPY_SCTP 0 #define DEFAULT_SYNC_REFRESH_PERIOD (0U * HZ) #define DEFAULT_SYNC_RETRIES 0 #define IPVS_SYNC_WAKEUP_RATE 8 #define IPVS_SYNC_QLEN_MAX (IPVS_SYNC_WAKEUP_RATE * 4) #define IPVS_SYNC_SEND_DELAY (HZ / 50) #define IPVS_SYNC_CHECK_PERIOD HZ #define IPVS_SYNC_FLUSH_TIME (HZ * 2) #define IPVS_SYNC_PORTS_MAX (1 << 6) #ifdef CONFIG_SYSCTL static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_threshold[0]; } static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_threshold[1]); } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_refresh_period); } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_retries; } static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_ver; } static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) { return ipvs->sysctl_sloppy_tcp; } static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) { return ipvs->sysctl_sloppy_sctp; } static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_ports); } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_persist_mode; } static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; } static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_sock_size; } static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) { return ipvs->sysctl_pmtu_disc; } static inline int sysctl_backup_only(struct netns_ipvs *ipvs) { return ipvs->sync_state & IP_VS_STATE_BACKUP && ipvs->sysctl_backup_only; } static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) { return ipvs->sysctl_conn_reuse_mode; } static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return ipvs->sysctl_expire_nodest_conn; } static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return ipvs->sysctl_schedule_icmp; } static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs) { return ipvs->sysctl_ignore_tunneled; } static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) { return ipvs->sysctl_cache_bypass; } static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) { return ipvs->sysctl_run_estimation; } static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) { if (ipvs->est_cpulist_valid) return ipvs->sysctl_est_cpulist; else return housekeeping_cpumask(HK_TYPE_KTHREAD); } static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return ipvs->sysctl_est_nice; } #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_THRESHOLD; } static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_PERIOD; } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_REFRESH_PERIOD; } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_RETRIES & 3; } static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_VER; } static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) { return DEFAULT_SLOPPY_TCP; } static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) { return DEFAULT_SLOPPY_SCTP; } static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) { return 0; } static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; } static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_backup_only(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) { return 1; } static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) { return housekeeping_cpumask(HK_TYPE_KTHREAD); } static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return IPVS_EST_NICE; } #endif /* IPVS core functions * (from ip_vs_core.c) */ const char *ip_vs_proto_name(unsigned int proto); void ip_vs_init_hash_table(struct list_head *table, int rows); struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport); #define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t))) #define IP_VS_APP_TYPE_FTP 1 /* ip_vs_conn handling functions * (from ip_vs_conn.c) */ enum { IP_VS_DIR_INPUT = 0, IP_VS_DIR_OUTPUT, IP_VS_DIR_INPUT_ONLY, IP_VS_DIR_LAST, }; static inline void ip_vs_conn_fill_param(struct netns_ipvs *ipvs, int af, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { p->ipvs = ipvs; p->af = af; p->protocol = protocol; p->caddr = caddr; p->cport = cport; p->vaddr = vaddr; p->vport = vport; p->pe = NULL; p->pe_data = NULL; } struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); /* Get reference to gain full access to conn. * By default, RCU read-side critical sections have access only to * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference. */ static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp) { return refcount_inc_not_zero(&cp->refcnt); } /* put back the conn without restarting its timer */ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) { smp_mb__before_atomic(); refcount_dec(&cp->refcnt); } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark); void ip_vs_conn_expire_now(struct ip_vs_conn *cp); const char *ip_vs_state_name(const struct ip_vs_conn *cp); void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest); void ip_vs_random_dropentry(struct netns_ipvs *ipvs); int ip_vs_conn_init(void); void ip_vs_conn_cleanup(void); static inline void ip_vs_control_del(struct ip_vs_conn *cp) { struct ip_vs_conn *ctl_cp = cp->control; if (!ctl_cp) { IP_VS_ERR_BUF("request control DEL for uncontrolled: " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); return; } IP_VS_DBG_BUF(7, "DELeting control for: " "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), ntohs(ctl_cp->cport)); cp->control = NULL; if (atomic_read(&ctl_cp->n_control) == 0) { IP_VS_ERR_BUF("BUG control DEL with n=0 : " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); return; } atomic_dec(&ctl_cp->n_control); } static inline void ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) { if (cp->control) { IP_VS_ERR_BUF("request control ADD for already controlled: " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); ip_vs_control_del(cp); } IP_VS_DBG_BUF(7, "ADDing control for: " "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), ntohs(ctl_cp->cport)); cp->control = ctl_cp; atomic_inc(&ctl_cp->n_control); } /* Mark our template as assured */ static inline void ip_vs_control_assure_ct(struct ip_vs_conn *cp) { struct ip_vs_conn *ct = cp->control; if (ct && !(ct->state & IP_VS_CTPL_S_ASSURED) && (ct->flags & IP_VS_CONN_F_TEMPLATE)) ct->state |= IP_VS_CTPL_S_ASSURED; } /* IPVS netns init & cleanup functions */ int ip_vs_estimator_net_init(struct netns_ipvs *ipvs); int ip_vs_control_net_init(struct netns_ipvs *ipvs); int ip_vs_protocol_net_init(struct netns_ipvs *ipvs); int ip_vs_app_net_init(struct netns_ipvs *ipvs); int ip_vs_conn_net_init(struct netns_ipvs *ipvs); int ip_vs_sync_net_init(struct netns_ipvs *ipvs); void ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_app_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_control_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_service_nets_cleanup(struct list_head *net_list); /* IPVS application functions * (from ip_vs_app.c) */ #define IP_VS_APP_MAX_PORTS 8 struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app); void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app); int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp); void ip_vs_unbind_app(struct ip_vs_conn *cp); int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port); int ip_vs_app_inc_get(struct ip_vs_app *inc); void ip_vs_app_inc_put(struct ip_vs_app *inc); int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh); int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); struct ip_vs_pe *ip_vs_pe_getbyname(const char *name); struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name); /* Use a #define to avoid all of module.h just for these trivial ops */ #define ip_vs_pe_get(pe) \ if (pe && pe->module) \ __module_get(pe->module); #define ip_vs_pe_put(pe) \ if (pe && pe->module) \ module_put(pe->module); /* IPVS protocol functions (from ip_vs_proto.c) */ int ip_vs_protocol_init(void); void ip_vs_protocol_cleanup(void); void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags); int *ip_vs_create_timeout_table(int *table, int size); void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); extern struct ip_vs_protocol ip_vs_protocol_tcp; extern struct ip_vs_protocol ip_vs_protocol_udp; extern struct ip_vs_protocol ip_vs_protocol_icmp; extern struct ip_vs_protocol ip_vs_protocol_esp; extern struct ip_vs_protocol ip_vs_protocol_ah; extern struct ip_vs_protocol ip_vs_protocol_sctp; /* Registering/unregistering scheduler functions * (from ip_vs_sched.c) */ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int ip_vs_bind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *scheduler); void ip_vs_unbind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *sched); struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *ignored, struct ip_vs_iphdr *iph); int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph); void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg); /* IPVS control data and functions (from ip_vs_ctl.c) */ extern struct ip_vs_stats ip_vs_stats; extern int sysctl_ip_vs_sync_ver; struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport); bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); struct ip_vs_dest * ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, const union nf_inet_addr *daddr, __be16 tun_port); int ip_vs_use_count_inc(void); void ip_vs_use_count_dec(void); int ip_vs_register_nl_ioctl(void); void ip_vs_unregister_nl_ioctl(void); int ip_vs_control_init(void); void ip_vs_control_cleanup(void); struct ip_vs_dest * ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags); void ip_vs_try_bind_dest(struct ip_vs_conn *cp); static inline void ip_vs_dest_hold(struct ip_vs_dest *dest) { refcount_inc(&dest->refcnt); } static inline void ip_vs_dest_put(struct ip_vs_dest *dest) { smp_mb__before_atomic(); refcount_dec(&dest->refcnt); } static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) { if (refcount_dec_and_test(&dest->refcnt)) kfree(dest); } /* IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *cfg, int state); int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); void ip_vs_est_reload_start(struct netns_ipvs *ipvs); int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL /* Stop tasks while cpulist is empty or if disabled with flag */ ipvs->est_stopped = !sysctl_run_estimation(ipvs) || (ipvs->est_cpulist_valid && cpumask_empty(sysctl_est_cpulist(ipvs))); #endif } static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL return ipvs->est_stopped; #else return false; #endif } static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) { unsigned int limit = IPVS_EST_CPU_KTHREADS * cpumask_weight(sysctl_est_cpulist(ipvs)); return max(1U, limit); } /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, struct ip_vs_iphdr *iph); void ip_vs_dest_dst_rcu_free(struct rcu_head *head); #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, struct ip_vs_iphdr *iph); #endif #ifdef CONFIG_SYSCTL /* This is a simple mechanism to ignore packets when * we are loaded. Just set ip_vs_drop_rate to 'n' and * we start to drop 1/rate of the packets */ static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { if (!ipvs->drop_rate) return 0; if (--ipvs->drop_counter > 0) return 0; ipvs->drop_counter = ipvs->drop_rate; return 1; } #else static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; } #endif #ifdef CONFIG_SYSCTL /* Enqueue delayed work for expiring no dest connections * Only run when sysctl_expire_nodest=1 */ static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) { if (sysctl_expire_nodest_conn(ipvs)) queue_delayed_work(system_long_wq, &ipvs->expire_nodest_conn_work, 1); } void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs); #else static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) {} #endif #define IP_VS_DFWD_METHOD(dest) (atomic_read(&(dest)->conn_flags) & \ IP_VS_CONN_F_FWD_MASK) /* ip_vs_fwd_tag returns the forwarding tag of the connection */ #define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK) static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) { char fwd; switch (IP_VS_FWD_METHOD(cp)) { case IP_VS_CONN_F_MASQ: fwd = 'M'; break; case IP_VS_CONN_F_LOCALNODE: fwd = 'L'; break; case IP_VS_CONN_F_TUNNEL: fwd = 'T'; break; case IP_VS_CONN_F_DROUTE: fwd = 'R'; break; case IP_VS_CONN_F_BYPASS: fwd = 'B'; break; default: fwd = '?'; break; } return fwd; } void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); #ifdef CONFIG_IP_VS_IPV6 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); #endif __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset); static inline __wsum ip_vs_check_diff4(__be32 old, __be32 new, __wsum oldsum) { __be32 diff[2] = { ~old, new }; return csum_partial(diff, sizeof(diff), oldsum); } #ifdef CONFIG_IP_VS_IPV6 static inline __wsum ip_vs_check_diff16(const __be32 *old, const __be32 *new, __wsum oldsum) { __be32 diff[8] = { ~old[3], ~old[2], ~old[1], ~old[0], new[3], new[2], new[1], new[0] }; return csum_partial(diff, sizeof(diff), oldsum); } #endif static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) { __be16 diff[2] = { ~old, new }; return csum_partial(diff, sizeof(diff), oldsum); } /* Forget current conntrack (unconfirmed) and attach notrack entry */ static inline void ip_vs_notrack(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_conntrack_put(&ct->ct_general); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } #endif } #ifdef CONFIG_IP_VS_NFCT /* Netfilter connection tracking * (from ip_vs_nfct.c) */ static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL return ipvs->sysctl_conntrack; #else return 0; #endif } void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin); int ip_vs_confirm_conntrack(struct sk_buff *skb); void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, struct ip_vs_conn *cp, u_int8_t proto, const __be16 port, int from_rs); void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp); #else static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { return 0; } static inline void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) { } static inline int ip_vs_confirm_conntrack(struct sk_buff *skb) { return NF_ACCEPT; } static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) { } #endif /* CONFIG_IP_VS_NFCT */ /* Using old conntrack that can not be redirected to another real server? */ static inline bool ip_vs_conn_uses_old_conntrack(struct ip_vs_conn *cp, struct sk_buff *skb) { #ifdef CONFIG_IP_VS_NFCT enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct && nf_ct_is_confirmed(ct)) return true; #endif return false; } static inline int ip_vs_register_conntrack(struct ip_vs_service *svc) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) int afmask = (svc->af == AF_INET6) ? 2 : 1; int ret = 0; if (!(svc->conntrack_afmask & afmask)) { ret = nf_ct_netns_get(svc->ipvs->net, svc->af); if (ret >= 0) svc->conntrack_afmask |= afmask; } return ret; #else return 0; #endif } static inline void ip_vs_unregister_conntrack(struct ip_vs_service *svc) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) int afmask = (svc->af == AF_INET6) ? 2 : 1; if (svc->conntrack_afmask & afmask) { nf_ct_netns_put(svc->ipvs->net, svc->af); svc->conntrack_afmask &= ~afmask; } #endif } int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af); void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af); static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { /* We think the overhead of processing active connections is 256 * times higher than that of inactive connections in average. (This * 256 times might not be accurate, we will change it later) We * use the following formula to estimate the overhead now: * dest->activeconns*256 + dest->inactconns */ return (atomic_read(&dest->activeconns) << 8) + atomic_read(&dest->inactconns); } #ifdef CONFIG_IP_VS_PROTO_TCP INDIRECT_CALLABLE_DECLARE(int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); #endif #ifdef CONFIG_IP_VS_PROTO_UDP INDIRECT_CALLABLE_DECLARE(int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); #endif #endif /* _NET_IP_VS_H */
17 6 17 17 17 17 17 17 17 17 17 17 6 6 5 5 4 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> #include <linux/uprobes.h> #include "internal.h" struct callchain_cpus_entries { struct rcu_head rcu_head; struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + sizeof(__u64) * (sysctl_perf_event_max_stack + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } static void release_callchain_buffers_rcu(struct rcu_head *head) { struct callchain_cpus_entries *entries; int cpu; entries = container_of(head, struct callchain_cpus_entries, rcu_head); for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); } static void release_callchain_buffers(void) { struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } static int alloc_callchain_buffers(void) { int cpu; int size; struct callchain_cpus_entries *entries; /* * We can't use the percpu allocation API for data that can be * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out. */ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) return -ENOMEM; size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!entries->cpu_entries[cpu]) goto fail; } rcu_assign_pointer(callchain_cpus_entries, entries); return 0; fail: for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); return -ENOMEM; } int get_callchain_buffers(int event_max_stack) { int err = 0; int count; mutex_lock(&callchain_mutex); count = atomic_inc_return(&nr_callchain_events); if (WARN_ON_ONCE(count < 1)) { err = -EINVAL; goto exit; } /* * If requesting per event more than the global cap, * return a different error to help userspace figure * this out. * * And also do it here so that we have &callchain_mutex held. */ if (event_max_stack > sysctl_perf_event_max_stack) { err = -EOVERFLOW; goto exit; } if (count == 1) err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); mutex_unlock(&callchain_mutex); return err; } void put_callchain_buffers(void) { if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { release_callchain_buffers(); mutex_unlock(&callchain_mutex); } } struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; entries = rcu_dereference(callchain_cpus_entries); if (!entries) { put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; } cpu = smp_processor_id(); return (((void *)entries->cpu_entries[cpu]) + (*rctx * perf_callchain_entry__sizeof())); } void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry, int start_entry_idx) { #ifdef CONFIG_UPROBES struct uprobe_task *utask = current->utask; struct return_instance *ri; __u64 *cur_ip, *last_ip, tramp_addr; if (likely(!utask || !utask->return_instances)) return; cur_ip = &entry->ip[start_entry_idx]; last_ip = &entry->ip[entry->nr - 1]; ri = utask->return_instances; tramp_addr = uprobe_get_trampoline_vaddr(); /* * If there are pending uretprobes for the current thread, they are * recorded in a list inside utask->return_instances; each such * pending uretprobe replaces traced user function's return address on * the stack, so when stack trace is captured, instead of seeing * actual function's return address, we'll have one or many uretprobe * trampoline addresses in the stack trace, which are not helpful and * misleading to users. * So here we go over the pending list of uretprobes, and each * encountered trampoline address is replaced with actual return * address. */ while (ri && cur_ip <= last_ip) { if (*cur_ip == tramp_addr) { *cur_ip = ri->orig_ret_vaddr; ri = ri->next; } cur_ip++; } #endif } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; ctx.contexts = 0; ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } if (user) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); else regs = NULL; } if (regs) { if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); start_entry_idx = entry->nr; perf_callchain_user(&ctx, regs); fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } } exit_put: put_callchain_entry(rctx); return entry; } /* * Used for sysctl_perf_event_max_stack and * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); if (ret || !write) return ret; mutex_lock(&callchain_mutex); if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else *value = new_value; mutex_unlock(&callchain_mutex); return ret; }
105 73 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * OSS compatible sequencer driver * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #ifndef __SEQ_OSS_DEVICE_H #define __SEQ_OSS_DEVICE_H #include <linux/time.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <sound/core.h> #include <sound/seq_oss.h> #include <sound/rawmidi.h> #include <sound/seq_kernel.h> #include <sound/info.h> #include "../seq_clientmgr.h" /* max. applications */ #define SNDRV_SEQ_OSS_MAX_CLIENTS 16 #define SNDRV_SEQ_OSS_MAX_SYNTH_DEVS 16 #define SNDRV_SEQ_OSS_MAX_MIDI_DEVS 32 /* version */ #define SNDRV_SEQ_OSS_MAJOR_VERSION 0 #define SNDRV_SEQ_OSS_MINOR_VERSION 1 #define SNDRV_SEQ_OSS_TINY_VERSION 8 #define SNDRV_SEQ_OSS_VERSION_STR "0.1.8" /* device and proc interface name */ #define SNDRV_SEQ_OSS_PROCNAME "oss" /* * type definitions */ typedef unsigned int reltime_t; typedef unsigned int abstime_t; /* * synthesizer channel information */ struct seq_oss_chinfo { int note, vel; }; /* * synthesizer information */ struct seq_oss_synthinfo { struct snd_seq_oss_arg arg; struct seq_oss_chinfo *ch; int nr_voices; int opened; int is_midi; int midi_mapped; }; /* * sequencer client information */ struct seq_oss_devinfo { int index; /* application index */ int cseq; /* sequencer client number */ int port; /* sequencer port number */ int queue; /* sequencer queue number */ struct snd_seq_addr addr; /* address of this device */ int seq_mode; /* sequencer mode */ int file_mode; /* file access */ /* midi device table */ int max_mididev; /* synth device table */ int max_synthdev; struct seq_oss_synthinfo synths[SNDRV_SEQ_OSS_MAX_SYNTH_DEVS]; int synth_opened; /* output queue */ struct seq_oss_writeq *writeq; /* midi input queue */ struct seq_oss_readq *readq; /* timer */ struct seq_oss_timer *timer; }; /* * function prototypes */ /* create/delete OSS sequencer client */ int snd_seq_oss_create_client(void); int snd_seq_oss_delete_client(void); /* device file interface */ int snd_seq_oss_open(struct file *file, int level); void snd_seq_oss_release(struct seq_oss_devinfo *dp); int snd_seq_oss_ioctl(struct seq_oss_devinfo *dp, unsigned int cmd, unsigned long arg); int snd_seq_oss_read(struct seq_oss_devinfo *dev, char __user *buf, int count); int snd_seq_oss_write(struct seq_oss_devinfo *dp, const char __user *buf, int count, struct file *opt); __poll_t snd_seq_oss_poll(struct seq_oss_devinfo *dp, struct file *file, poll_table * wait); void snd_seq_oss_reset(struct seq_oss_devinfo *dp); /* proc interface */ void snd_seq_oss_system_info_read(struct snd_info_buffer *buf); void snd_seq_oss_midi_info_read(struct snd_info_buffer *buf); void snd_seq_oss_synth_info_read(struct snd_info_buffer *buf); void snd_seq_oss_readq_info_read(struct seq_oss_readq *q, struct snd_info_buffer *buf); /* file mode macros */ #define is_read_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_READ) #define is_write_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_WRITE) #define is_nonblock_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_NONBLOCK) /* dispatch event */ static inline int snd_seq_oss_dispatch(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int atomic, int hop) { return snd_seq_kernel_client_dispatch(dp->cseq, ev, atomic, hop); } /* ioctl for writeq */ static inline int snd_seq_oss_control(struct seq_oss_devinfo *dp, unsigned int type, void *arg) { int err; snd_seq_client_ioctl_lock(dp->cseq); err = snd_seq_kernel_client_ctl(dp->cseq, type, arg); snd_seq_client_ioctl_unlock(dp->cseq); return err; } /* fill the addresses in header */ static inline void snd_seq_oss_fill_addr(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dest_client, int dest_port) { ev->queue = dp->queue; ev->source = dp->addr; ev->dest.client = dest_client; ev->dest.port = dest_port; } #endif /* __SEQ_OSS_DEVICE_H */
199 1 1 2 1 9 9 8 7 5 4 2 1 3 2 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 // SPDX-License-Identifier: GPL-2.0 #include "io_uring.h" #include "napi.h" #ifdef CONFIG_NET_RX_BUSY_POLL /* Timeout for cleanout of stale entries. */ #define NAPI_TIMEOUT (60 * SEC_CONVERSION) struct io_napi_entry { unsigned int napi_id; struct list_head list; unsigned long timeout; struct hlist_node node; struct rcu_head rcu; }; static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, unsigned int napi_id) { struct io_napi_entry *e; hlist_for_each_entry_rcu(e, hash_list, node) { if (e->napi_id != napi_id) continue; return e; } return NULL; } static inline ktime_t net_to_ktime(unsigned long t) { /* napi approximating usecs, reverse busy_loop_current_time */ return ns_to_ktime(t << 10); } int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) { struct hlist_head *hash_list; struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. */ if (napi_id < MIN_NAPI_ID) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; scoped_guard(rcu) { e = io_napi_hash_find(hash_list, napi_id); if (e) { WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); return -EEXIST; } } e = kmalloc(sizeof(*e), GFP_NOWAIT); if (!e) return -ENOMEM; e->napi_id = napi_id; e->timeout = jiffies + NAPI_TIMEOUT; /* * guard(spinlock) is not used to manually unlock it before calling * kfree() */ spin_lock(&ctx->napi_lock); if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); kfree(e); return -EEXIST; } hlist_add_tail_rcu(&e->node, hash_list); list_add_tail_rcu(&e->list, &ctx->napi_list); spin_unlock(&ctx->napi_lock); return 0; } static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) { struct hlist_head *hash_list; struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. */ if (napi_id < MIN_NAPI_ID) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; guard(spinlock)(&ctx->napi_lock); e = io_napi_hash_find(hash_list, napi_id); if (!e) return -ENOENT; list_del_rcu(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); return 0; } static void __io_napi_remove_stale(struct io_ring_ctx *ctx) { struct io_napi_entry *e; guard(spinlock)(&ctx->napi_lock); /* * list_for_each_entry_safe() is not required as long as: * 1. list_del_rcu() does not reset the deleted node next pointer * 2. kfree_rcu() delays the memory freeing until the next quiescent * state */ list_for_each_entry(e, &ctx->napi_list, list) { if (time_after(jiffies, READ_ONCE(e->timeout))) { list_del_rcu(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); } } } static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) { if (is_stale) __io_napi_remove_stale(ctx); } static inline bool io_napi_busy_loop_timeout(ktime_t start_time, ktime_t bp) { if (bp) { ktime_t end_time = ktime_add(start_time, bp); ktime_t now = net_to_ktime(busy_loop_current_time()); return ktime_after(now, end_time); } return true; } static bool io_napi_busy_loop_should_end(void *data, unsigned long start_time) { struct io_wait_queue *iowq = data; if (signal_pending(current)) return true; if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return true; if (io_napi_busy_loop_timeout(net_to_ktime(start_time), iowq->napi_busy_poll_dt)) return true; return false; } /* * never report stale entries */ static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { struct io_napi_entry *e; list_for_each_entry_rcu(e, &ctx->napi_list, list) napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); return false; } static bool dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { struct io_napi_entry *e; bool is_stale = false; list_for_each_entry_rcu(e, &ctx->napi_list, list) { napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); if (time_after(jiffies, READ_ONCE(e->timeout))) is_stale = true; } return is_stale; } static inline bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); } static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { unsigned long start_time = busy_loop_current_time(); bool (*loop_end)(void *, unsigned long) = NULL; void *loop_end_arg = NULL; bool is_stale = false; /* Singular lists use a different napi loop end check function and are * only executed once. */ if (list_is_singular(&ctx->napi_list)) { loop_end = io_napi_busy_loop_should_end; loop_end_arg = iowq; } scoped_guard(rcu) { do { is_stale = __io_napi_do_busy_loop(ctx, loop_end, loop_end_arg); } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); } io_napi_remove_stale(ctx, is_stale); } /* * io_napi_init() - Init napi settings * @ctx: pointer to io-uring context structure * * Init napi settings in the io-uring context. */ void io_napi_init(struct io_ring_ctx *ctx) { u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; INIT_LIST_HEAD(&ctx->napi_list); spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; } /* * io_napi_free() - Deallocate napi * @ctx: pointer to io-uring context structure * * Free the napi list and the hash table in the io-uring context. */ void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; guard(spinlock)(&ctx->napi_lock); list_for_each_entry(e, &ctx->napi_list, list) { hash_del_rcu(&e->node); kfree_rcu(e, rcu); } INIT_LIST_HEAD_RCU(&ctx->napi_list); } static int io_napi_register_napi(struct io_ring_ctx *ctx, struct io_uring_napi *napi) { switch (napi->op_param) { case IO_URING_NAPI_TRACKING_DYNAMIC: case IO_URING_NAPI_TRACKING_STATIC: break; default: return -EINVAL; } /* clean the napi list for new settings */ io_napi_free(ctx); WRITE_ONCE(ctx->napi_track_mode, napi->op_param); WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); return 0; } /* * io_napi_register() - Register napi with io-uring * @ctx: pointer to io-uring context structure * @arg: pointer to io_uring_napi structure * * Register napi in the io-uring context. */ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll, .op_param = ctx->napi_track_mode }; struct io_uring_napi napi; if (ctx->flags & IORING_SETUP_IOPOLL) return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; if (napi.pad[0] || napi.pad[1] || napi.resv) return -EINVAL; if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; switch (napi.opcode) { case IO_URING_NAPI_REGISTER_OP: return io_napi_register_napi(ctx, &napi); case IO_URING_NAPI_STATIC_ADD_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; return __io_napi_add_id(ctx, napi.op_param); case IO_URING_NAPI_STATIC_DEL_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; return __io_napi_del_id(ctx, napi.op_param); default: return -EINVAL; } } /* * io_napi_unregister() - Unregister napi with io-uring * @ctx: pointer to io-uring context structure * @arg: pointer to io_uring_napi structure * * Unregister napi. If arg has been specified copy the busy poll timeout and * prefer busy poll setting to the passed in structure. */ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); return 0; } /* * __io_napi_busy_loop() - execute busy poll loop * @ctx: pointer to io-uring context structure * @iowq: pointer to io wait queue * * Execute the busy poll loop and merge the spliced off list. */ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { if (ctx->flags & IORING_SETUP_SQPOLL) return; iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); if (iowq->timeout != KTIME_MAX) { ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); } iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); io_napi_blocking_busy_loop(ctx, iowq); } /* * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll * @ctx: pointer to io-uring context structure * * Splice of the napi list and execute the napi busy poll loop. */ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { bool is_stale = false; if (!READ_ONCE(ctx->napi_busy_poll_dt)) return 0; if (list_empty_careful(&ctx->napi_list)) return 0; scoped_guard(rcu) { is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); } io_napi_remove_stale(ctx, is_stale); return 1; } #endif
28 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_TRACEPOINT_H #define _LINUX_TRACEPOINT_H /* * Kernel Tracepoint API. * * See Documentation/trace/tracepoints.rst. * * Copyright (C) 2008-2014 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> * * Heavily inspired from the Linux Kernel Markers. */ #include <linux/smp.h> #include <linux/srcu.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/rcupdate_trace.h> #include <linux/tracepoint-defs.h> #include <linux/static_call.h> struct module; struct tracepoint; struct notifier_block; struct trace_eval_map { const char *system; const char *eval_string; unsigned long eval_value; }; #define TRACEPOINT_DEFAULT_PRIO 10 extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data, int prio); extern int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data, int prio); extern int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data); static inline int tracepoint_probe_register_may_exist(struct tracepoint *tp, void *probe, void *data) { return tracepoint_probe_register_prio_may_exist(tp, probe, data, TRACEPOINT_DEFAULT_PRIO); } extern void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), void *priv); #ifdef CONFIG_MODULES struct tp_module { struct list_head list; struct module *mod; }; bool trace_module_has_bad_taint(struct module *mod); extern int register_tracepoint_module_notifier(struct notifier_block *nb); extern int unregister_tracepoint_module_notifier(struct notifier_block *nb); void for_each_module_tracepoint(void (*fct)(struct tracepoint *, struct module *, void *), void *priv); void for_each_tracepoint_in_module(struct module *, void (*fct)(struct tracepoint *, struct module *, void *), void *priv); #else static inline bool trace_module_has_bad_taint(struct module *mod) { return false; } static inline int register_tracepoint_module_notifier(struct notifier_block *nb) { return 0; } static inline int unregister_tracepoint_module_notifier(struct notifier_block *nb) { return 0; } static inline void for_each_module_tracepoint(void (*fct)(struct tracepoint *, struct module *, void *), void *priv) { } static inline void for_each_tracepoint_in_module(struct module *mod, void (*fct)(struct tracepoint *, struct module *, void *), void *priv) { } #endif /* CONFIG_MODULES */ /* * tracepoint_synchronize_unregister must be called between the last tracepoint * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. * * An alternative is to use the following for batch reclaim associated * with a given tracepoint: * * - tracepoint_is_faultable() == false: call_rcu() * - tracepoint_is_faultable() == true: call_rcu_tasks_trace() */ #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { synchronize_rcu_tasks_trace(); synchronize_rcu(); } static inline bool tracepoint_is_faultable(struct tracepoint *tp) { return tp->ext && tp->ext->faultable; } #else static inline void tracepoint_synchronize_unregister(void) { } static inline bool tracepoint_is_faultable(struct tracepoint *tp) { return false; } #endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS extern int syscall_regfunc(void); extern void syscall_unregfunc(void); #endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ #ifndef PARAMS #define PARAMS(args...) args #endif #define TRACE_DEFINE_ENUM(x) #define TRACE_DEFINE_SIZEOF(x) #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) { return offset_to_ptr(p); } #define __TRACEPOINT_ENTRY(name) \ asm(" .section \"__tracepoints_ptrs\", \"a\" \n" \ " .balign 4 \n" \ " .long __tracepoint_" #name " - . \n" \ " .previous \n") #else static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) { return *p; } #define __TRACEPOINT_ENTRY(name) \ static tracepoint_ptr_t __tracepoint_ptr_##name __used \ __section("__tracepoints_ptrs") = &__tracepoint_##name #endif #endif /* _LINUX_TRACEPOINT_H */ /* * Note: we keep the TRACE_EVENT and DECLARE_TRACE outside the include * file ifdef protection. * This is due to the way trace events work. If a file includes two * trace event headers under one "CREATE_TRACE_POINTS" the first include * will override the TRACE_EVENT and break the second include. */ #ifndef DECLARE_TRACE #define TP_PROTO(args...) args #define TP_ARGS(args...) args #define TP_CONDITION(args...) args /* * Individual subsystem my have a separate configuration to * enable their tracepoints. By default, this file will create * the tracepoints if CONFIG_TRACEPOINTS is defined. If a subsystem * wants to be able to disable its tracepoints from being created * it can define NOTRACE before including the tracepoint headers. */ #if defined(CONFIG_TRACEPOINTS) && !defined(NOTRACE) #define TRACEPOINTS_ENABLED #endif #ifdef TRACEPOINTS_ENABLED #ifdef CONFIG_HAVE_STATIC_CALL #define __DO_TRACE_CALL(name, args) \ do { \ struct tracepoint_func *it_func_ptr; \ void *__data; \ it_func_ptr = \ rcu_dereference_raw((&__tracepoint_##name)->funcs); \ if (it_func_ptr) { \ __data = (it_func_ptr)->data; \ static_call(tp_func_##name)(__data, args); \ } \ } while (0) #else #define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */ /* * Declare an exported function that Rust code can call to trigger this * tracepoint. This function does not include the static branch; that is done * in Rust to avoid a function call when the tracepoint is disabled. */ #define DEFINE_RUST_DO_TRACE(name, proto, args) #define __DEFINE_RUST_DO_TRACE(name, proto, args) \ notrace void rust_do_trace_##name(proto) \ { \ __do_trace_##name(args); \ } /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. * * When lockdep is enabled, we make sure to always test if RCU is * "watching" regardless if the tracepoint is enabled or not. Tracepoints * require RCU to be active, and it should always warn at the tracepoint * site if it is not watching, as it will need to be active when the * tracepoint is enabled. */ #define __DECLARE_TRACE_COMMON(name, proto, args, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ extern void rust_do_trace_##name(proto); \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ return tracepoint_probe_register(&__tracepoint_##name, \ (void *)probe, data); \ } \ static inline int \ register_trace_prio_##name(void (*probe)(data_proto), void *data,\ int prio) \ { \ return tracepoint_probe_register_prio(&__tracepoint_##name, \ (void *)probe, data, prio); \ } \ static inline int \ unregister_trace_##name(void (*probe)(data_proto), void *data) \ { \ return tracepoint_probe_unregister(&__tracepoint_##name,\ (void *)probe, data); \ } \ static inline void \ check_trace_callback_type_##name(void (*cb)(data_proto)) \ { \ } \ static inline bool \ trace_##name##_enabled(void) \ { \ return static_branch_unlikely(&__tracepoint_##name.key);\ } #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ if (cond) { \ guard(preempt_notrace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ } \ static inline void trace_##name(proto) \ { \ if (static_branch_unlikely(&__tracepoint_##name.key)) \ __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ } #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ static inline void trace_##name(proto) \ { \ might_fault(); \ if (static_branch_unlikely(&__tracepoint_##name.key)) \ __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ } /* * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. * * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ #define __DEFINE_TRACE_EXT(_name, _ext, proto, args) \ static const char __tpstrtab_##_name[] \ __section("__tracepoints_strings") = #_name; \ extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ int __traceiter_##_name(void *__data, proto); \ void __probestub_##_name(void *__data, proto); \ struct tracepoint __tracepoint_##_name __used \ __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ .key = STATIC_KEY_FALSE_INIT, \ .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ .probestub = &__probestub_##_name, \ .funcs = NULL, \ .ext = _ext, \ }; \ __TRACEPOINT_ENTRY(_name); \ int __traceiter_##_name(void *__data, proto) \ { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ \ it_func_ptr = \ rcu_dereference_raw((&__tracepoint_##_name)->funcs); \ if (it_func_ptr) { \ do { \ it_func = READ_ONCE((it_func_ptr)->func); \ __data = (it_func_ptr)->data; \ ((void(*)(void *, proto))(it_func))(__data, args); \ } while ((++it_func_ptr)->func); \ } \ return 0; \ } \ void __probestub_##_name(void *__data, proto) \ { \ } \ DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); \ DEFINE_RUST_DO_TRACE(_name, TP_PROTO(proto), TP_ARGS(args)) #define DEFINE_TRACE_FN(_name, _reg, _unreg, _proto, _args) \ static struct tracepoint_ext __tracepoint_ext_##_name = { \ .regfunc = _reg, \ .unregfunc = _unreg, \ .faultable = false, \ }; \ __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); #define DEFINE_TRACE_SYSCALL(_name, _reg, _unreg, _proto, _args) \ static struct tracepoint_ext __tracepoint_ext_##_name = { \ .regfunc = _reg, \ .unregfunc = _unreg, \ .faultable = true, \ }; \ __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); #define DEFINE_TRACE(_name, _proto, _args) \ __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ EXPORT_SYMBOL_GPL(__traceiter_##name); \ EXPORT_STATIC_CALL_GPL(tp_func_##name) #define EXPORT_TRACEPOINT_SYMBOL(name) \ EXPORT_SYMBOL(__tracepoint_##name); \ EXPORT_SYMBOL(__traceiter_##name); \ EXPORT_STATIC_CALL(tp_func_##name) #else /* !TRACEPOINTS_ENABLED */ #define __DECLARE_TRACE_COMMON(name, proto, args, data_proto) \ static inline void trace_##name(proto) \ { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ { \ return -ENOSYS; \ } \ static inline int \ unregister_trace_##name(void (*probe)(data_proto), \ void *data) \ { \ return -ENOSYS; \ } \ static inline void check_trace_callback_type_##name(void (*cb)(data_proto)) \ { \ } \ static inline bool \ trace_##name##_enabled(void) \ { \ return false; \ } #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) #define DEFINE_TRACE_FN(name, reg, unreg, proto, args) #define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args) #define DEFINE_TRACE(name, proto, args) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) #endif /* TRACEPOINTS_ENABLED */ #ifdef CONFIG_TRACING /** * tracepoint_string - register constant persistent string to trace system * @str - a constant persistent string that will be referenced in tracepoints * * If constant strings are being used in tracepoints, it is faster and * more efficient to just save the pointer to the string and reference * that with a printf "%s" instead of saving the string in the ring buffer * and wasting space and time. * * The problem with the above approach is that userspace tools that read * the binary output of the trace buffers do not have access to the string. * Instead they just show the address of the string which is not very * useful to users. * * With tracepoint_string(), the string will be registered to the tracing * system and exported to userspace via the debugfs/tracing/printk_formats * file that maps the string address to the string text. This way userspace * tools that read the binary buffers have a way to map the pointers to * the ASCII strings they represent. * * The @str used must be a constant string and persistent as it would not * make sense to show a string that no longer exists. But it is still fine * to be used with modules, because when modules are unloaded, if they * had tracepoints, the ring buffers are cleared too. As long as the string * does not change during the life of the module, it is fine to use * tracepoint_string() within a module. */ #define tracepoint_string(str) \ ({ \ static const char *___tp_str __tracepoint_string = str; \ ___tp_str; \ }) #define __tracepoint_string __used __section("__tracepoint_str") #else /* * tracepoint_string() is used to save the string address for userspace * tracing tools. When tracing isn't configured, there's no need to save * anything. */ # define tracepoint_string(str) str # define __tracepoint_string #endif #define DECLARE_TRACE(name, proto, args) \ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_SYSCALL(name, proto, args) \ __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ PARAMS(void *__data, proto)) #define TRACE_EVENT_FLAGS(event, flag) #define TRACE_EVENT_PERF_PERM(event, expr...) #endif /* DECLARE_TRACE */ #ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: * * We define a tracepoint, its arguments, its printk format * and its 'fast binary record' layout. * * Firstly, name your tracepoint via TRACE_EVENT(name : the * 'subsystem_event' notation is fine. * * Think about this whole construct as the * 'trace_sched_switch() function' from now on. * * * TRACE_EVENT(sched_switch, * * * * * A function has a regular function arguments * * prototype, declare it via TP_PROTO(): * * * * TP_PROTO(struct rq *rq, struct task_struct *prev, * struct task_struct *next), * * * * * Define the call signature of the 'function'. * * (Design sidenote: we use this instead of a * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) * * * * TP_ARGS(rq, prev, next), * * * * * Fast binary tracing: define the trace record via * * TP_STRUCT__entry(). You can think about it like a * * regular C structure local variable definition. * * * * This is how the trace record is structured and will * * be saved into the ring buffer. These are the fields * * that will be exposed to user-space in * * /sys/kernel/tracing/events/<*>/format. * * * * The declared 'local variable' is called '__entry' * * * * __field(pid_t, prev_pid) is equivalent to a standard declaration: * * * * pid_t prev_pid; * * * * __array(char, prev_comm, TASK_COMM_LEN) is equivalent to: * * * * char prev_comm[TASK_COMM_LEN]; * * * * TP_STRUCT__entry( * __array( char, prev_comm, TASK_COMM_LEN ) * __field( pid_t, prev_pid ) * __field( int, prev_prio ) * __array( char, next_comm, TASK_COMM_LEN ) * __field( pid_t, next_pid ) * __field( int, next_prio ) * ), * * * * * Assign the entry into the trace record, by embedding * * a full C statement block into TP_fast_assign(). You * * can refer to the trace record as '__entry' - * * otherwise you can put arbitrary C code in here. * * * * Note: this C code will execute every time a trace event * * happens, on an active tracepoint. * * * * TP_fast_assign( * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); * __entry->prev_pid = prev->pid; * __entry->prev_prio = prev->prio; * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; * __entry->next_prio = next->prio; * ), * * * * * Formatted output of a trace record via TP_printk(). * * This is how the tracepoint will appear under ftrace * * plugins that make use of this tracepoint. * * * * (raw-binary tracing wont actually perform this step.) * * * * TP_printk("task %s:%d [%d] ==> %s:%d [%d]", * __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, * __entry->next_comm, __entry->next_pid, __entry->next_prio), * * ); * * This macro construct is thus used for the regular printk format * tracing setup, it is used to construct a function pointer based * tracepoint callback (this is used by programmatic plugins and * can also by used by generic instrumentation like SystemTap), and * it is also used to expose a structured trace record in * /sys/kernel/tracing/events/. * * A set of (un)registration functions can be passed to the variant * TRACE_EVENT_FN to perform any (un)registration work. */ #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print) #define DEFINE_EVENT(template, name, proto, args) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_CONDITION(template, name, proto, \ args, cond) \ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FN(name, proto, args, struct, \ assign, print, reg, unreg) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \ assign, print, reg, unreg) \ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_CONDITION(name, proto, args, cond, \ struct, assign, print) \ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \ print, reg, unreg) \ DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FLAGS(event, flag) #define TRACE_EVENT_PERF_PERM(event, expr...) #define DECLARE_EVENT_NOP(name, proto, args) \ static inline void trace_##name(proto) \ { } \ static inline bool trace_##name##_enabled(void) \ { \ return false; \ } #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print) \ DECLARE_EVENT_NOP(name, PARAMS(proto), PARAMS(args)) #define DECLARE_EVENT_CLASS_NOP(name, proto, args, tstruct, assign, print) #define DEFINE_EVENT_NOP(template, name, proto, args) \ DECLARE_EVENT_NOP(name, PARAMS(proto), PARAMS(args)) #endif /* ifdef TRACE_EVENT (see note above) */
392 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_ACT_API_H #define __NET_ACT_API_H /* * Public action API for classifiers/qdiscs */ #include <linux/refcount.h> #include <net/flow_offload.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/net_namespace.h> #include <net/netns/generic.h> struct tcf_idrinfo { struct mutex lock; struct idr action_idr; struct net *net; }; struct tc_action_ops; struct tc_action { const struct tc_action_ops *ops; __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ struct tcf_idrinfo *idrinfo; u32 tcfa_index; refcount_t tcfa_refcnt; atomic_t tcfa_bindcnt; int tcfa_action; struct tcf_t tcfa_tm; struct gnet_stats_basic_sync tcfa_bstats; struct gnet_stats_basic_sync tcfa_bstats_hw; struct gnet_stats_queue tcfa_qstats; struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie __rcu *user_cookie; struct tcf_chain __rcu *goto_chain; u32 tcfa_flags; u8 hw_stats; u8 used_hw_stats; bool used_hw_stats_valid; u32 in_hw_count; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt #define tcf_bindcnt common.tcfa_bindcnt #define tcf_action common.tcfa_action #define tcf_tm common.tcfa_tm #define tcf_bstats common.tcfa_bstats #define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock #define TCA_ACT_HW_STATS_ANY (TCA_ACT_HW_STATS_IMMEDIATE | \ TCA_ACT_HW_STATS_DELAYED) /* Reserve 16 bits for user-space. See TCA_ACT_FLAGS_NO_PERCPU_STATS. */ #define TCA_ACT_FLAGS_USER_BITS 16 #define TCA_ACT_FLAGS_USER_MASK 0xffff #define TCA_ACT_FLAGS_POLICE (1U << TCA_ACT_FLAGS_USER_BITS) #define TCA_ACT_FLAGS_BIND (1U << (TCA_ACT_FLAGS_USER_BITS + 1)) #define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) #define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) #define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. */ static inline void tcf_lastuse_update(struct tcf_t *tm) { unsigned long now = jiffies; if (tm->lastuse != now) tm->lastuse = now; if (unlikely(!tm->firstuse)) tm->firstuse = now; } static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) { dtm->install = jiffies_to_clock_t(jiffies - stm->install); dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); dtm->firstuse = stm->firstuse ? jiffies_to_clock_t(jiffies - stm->firstuse) : 0; dtm->expires = jiffies_to_clock_t(stm->expires); } static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) { if (WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY)) return FLOW_ACTION_HW_STATS_DONT_CARE; else if (!hw_stats) return FLOW_ACTION_HW_STATS_DISABLED; return hw_stats; } typedef void (*tc_action_priv_destructor)(void *priv); struct tc_action_ops { struct list_head head; char kind[IFNAMSIZ]; enum tca_id id; /* identifier should match kind */ unsigned int net_id; size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *); /* called under RCU BH lock*/ int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); int (*lookup)(struct net *net, struct tc_action **a, u32 index); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *, struct netlink_ext_ack *); void (*stats_update)(struct tc_action *, u64, u64, u64, u64, bool); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a, tc_action_priv_destructor *destructor); struct psample_group * (*get_psample_group)(const struct tc_action *a, tc_action_priv_destructor *destructor); int (*offload_act_setup)(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack); }; #ifdef CONFIG_NET_CLS_ACT #define ACT_P_BOUND 0 #define ACT_P_CREATED 1 #define ACT_P_DELETED 1 struct tc_action_net { struct tcf_idrinfo *idrinfo; const struct tc_action_ops *ops; }; static inline int tc_action_net_init(struct net *net, struct tc_action_net *tn, const struct tc_action_ops *ops) { int err = 0; tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL); if (!tn->idrinfo) return -ENOMEM; tn->ops = ops; tn->idrinfo->net = net; mutex_init(&tn->idrinfo->lock); idr_init(&tn->idrinfo->action_idr); return err; } void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo); static inline void tc_action_net_exit(struct list_head *net_list, unsigned int id) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { struct tc_action_net *tn = net_generic(net, id); tcf_idrinfo_destroy(tn->ops, tn->idrinfo); kfree(tn->idrinfo); } rtnl_unlock(); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack); int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index); int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats, u32 flags); int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); int tcf_idr_release(struct tc_action *a, bool bind); int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); #define NET_ACT_ALIAS_PREFIX "net-act-" #define MODULE_ALIAS_NET_ACT(kind) MODULE_ALIAS(NET_ACT_ALIAS_PREFIX kind) int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action_ops *a_o, int *init_res, u32 flags, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) { if (likely(a->cpu_bstats)) { bstats_update(this_cpu_ptr(a->cpu_bstats), skb); return; } spin_lock(&a->tcfa_lock); bstats_update(&a->tcfa_bstats, skb); spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_drop_qstats(struct tc_action *a) { if (likely(a->cpu_qstats)) { qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); return; } spin_lock(&a->tcfa_lock); qstats_drop_inc(&a->tcfa_qstats); spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) { if (likely(a->cpu_qstats)) { qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); return; } spin_lock(&a->tcfa_lock); qstats_overlimit_inc(&a->tcfa_qstats); spin_unlock(&a->tcfa_lock); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); int tcf_action_update_hw_stats(struct tc_action *action); int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, void *cb_priv, bool add); int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, struct tcf_chain **handle, struct netlink_ext_ack *newchain); struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, struct tcf_chain *newchain); #ifdef CONFIG_INET DECLARE_STATIC_KEY_FALSE(tcf_frag_xmit_count); #endif int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); #else /* !CONFIG_NET_CLS_ACT */ static inline int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, void *cb_priv, bool add) { return 0; } #endif /* CONFIG_NET_CLS_ACT */ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { #ifdef CONFIG_NET_CLS_ACT if (!a->ops->stats_update) return; a->ops->stats_update(a, bytes, packets, drops, lastuse, hw); #endif } #endif
804 25 25 8 804 801 11 801 26 2 1 2 6 4 4 4 3 25 23 3 39 16 16 16 38 30 22 22 30 74 74 74 74 74 74 29 29 11 3 9 9 9 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/init.h> #include <linux/llc.h> #include <net/llc.h> #include <net/stp.h> #include <net/switchdev.h> #include "br_private.h" /* * Handle changes in state of network devices enslaved to a bridge. * * Note: don't care about up/down if bridge itself is down, because * port state is checked when bridge is brought up. */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; bool notified = false; bool changed_addr; int err; if (netif_is_bridge_master(dev)) { err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); if (event == NETDEV_REGISTER) { /* register of bridge completed, add sysfs entries */ err = br_sysfs_addbr(dev); if (err) return notifier_from_errno(err); return NOTIFY_DONE; } } if (is_vlan_dev(dev)) { struct net_device *real_dev = vlan_dev_real_dev(dev); if (netif_is_bridge_master(real_dev)) br_vlan_vlan_upper_event(real_dev, dev, event); } /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) return NOTIFY_DONE; br = p->br; switch (event) { case NETDEV_CHANGEMTU: br_mtu_auto_adjust(br); break; case NETDEV_PRE_CHANGEADDR: if (br->dev->addr_assign_type == NET_ADDR_SET) break; prechaddr_info = ptr; err = dev_pre_changeaddr_notify(br->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); break; case NETDEV_CHANGEADDR: spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); break; case NETDEV_CHANGE: br_port_carrier_check(p, &notified); break; case NETDEV_FEAT_CHANGE: netdev_update_features(br->dev); break; case NETDEV_DOWN: spin_lock_bh(&br->lock); if (br->dev->flags & IFF_UP) { br_stp_disable_port(p); notified = true; } spin_unlock_bh(&br->lock); break; case NETDEV_UP: if (netif_running(br->dev) && netif_oper_up(dev)) { spin_lock_bh(&br->lock); br_stp_enable_port(p); notified = true; spin_unlock_bh(&br->lock); } break; case NETDEV_UNREGISTER: br_del_if(br, dev); break; case NETDEV_CHANGENAME: err = br_sysfs_renameif(p); if (err) return notifier_from_errno(err); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, br->dev); break; } if (event != NETDEV_UNREGISTER) br_vlan_port_event(p, event); /* Events that may cause spanning tree to refresh */ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN)) br_ifinfo_notify(RTM_NEWLINK, NULL, p); return NOTIFY_DONE; } static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; /* called with RTNL or RCU */ static int br_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; p = br_port_get_rtnl_rcu(dev); if (!p) goto out; br = p->br; switch (event) { case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid, fdb_info->locked, false); if (err) { err = notifier_from_errno(err); break; } br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid, false); if (err) err = notifier_from_errno(err); break; case SWITCHDEV_FDB_OFFLOADED: fdb_info = ptr; br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_FLUSH_TO_BRIDGE: fdb_info = ptr; /* Don't delete static entries */ br_fdb_delete_by_port(br, p, fdb_info->vid, 0); break; } out: return err; } static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; /* called under rtnl_mutex */ static int br_switchdev_blocking_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_brport_info *brport_info; const struct switchdev_brport *b; struct net_bridge_port *p; int err = NOTIFY_DONE; p = br_port_get_rtnl(dev); if (!p) goto out; switch (event) { case SWITCHDEV_BRPORT_OFFLOADED: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_offload(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, b->tx_fwd_offload, extack); err = notifier_from_errno(err); break; case SWITCHDEV_BRPORT_UNOFFLOADED: brport_info = ptr; b = &brport_info->brport; br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; case SWITCHDEV_BRPORT_REPLAY: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, extack); err = notifier_from_errno(err); break; } out: return err; } static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device * @opt: id of the option to change * @on: new option value * @extack: extack for error messages * * Changes the value of the respective boolean option to @on taking care of * any internal option value mapping and configuration. */ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { int err = 0; switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; case BR_BOOLOPT_MCAST_VLAN_SNOOPING: err = br_multicast_toggle_vlan_snooping(br, on, extack); break; case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) { switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); case BR_BOOLOPT_MCAST_VLAN_SNOOPING: return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return 0; } int br_boolopt_multi_toggle(struct net_bridge *br, struct br_boolopt_multi *bm, struct netlink_ext_ack *extack) { unsigned long bitmap = bm->optmask; int err = 0; int opt_id; for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); err = br_boolopt_toggle(br, opt_id, on, extack); if (err) { br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", opt_id, br_boolopt_get(br, opt_id), on, err); break; } } return err; } void br_boolopt_multi_get(const struct net_bridge *br, struct br_boolopt_multi *bm) { u32 optval = 0; int opt_id; for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) optval |= (br_boolopt_get(br, opt_id) << opt_id); bm->optval = optval; bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); } /* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); br_debug(br, "toggle option: %d state: %d -> %d\n", opt, cur, on); if (cur == on) return; if (on) set_bit(opt, &br->options); else clear_bit(opt, &br->options); } static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net_device *dev; struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) for_each_netdev(net, dev) if (netif_is_bridge_master(dev)) br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { .exit_batch_rtnl = br_net_exit_batch_rtnl, }; static const struct stp_proto br_stp_proto = { .rcv = br_stp_rcv, }; static int __init br_init(void) { int err; BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb)); err = stp_proto_register(&br_stp_proto); if (err < 0) { pr_err("bridge: can't register sap for STP\n"); return err; } err = br_fdb_init(); if (err) goto err_out; err = register_pernet_subsys(&br_net_ops); if (err) goto err_out1; err = br_nf_core_init(); if (err) goto err_out2; err = register_netdevice_notifier(&br_device_notifier); if (err) goto err_out3; err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); if (err) goto err_out5; err = br_netlink_init(); if (err) goto err_out6; brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; #endif #if IS_MODULE(CONFIG_BRIDGE_NETFILTER) pr_info("bridge: filtering via arp/ip/ip6tables is no longer available " "by default. Update your scripts to load br_netfilter if you " "need this.\n"); #endif return 0; err_out6: unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); err_out5: unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: br_fdb_fini(); err_out: stp_proto_unregister(&br_stp_proto); return err; } static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif br_fdb_fini(); } module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); MODULE_VERSION(BR_VERSION); MODULE_ALIAS_RTNL_LINK("bridge"); MODULE_DESCRIPTION("Ethernet bridge driver");
10 13 13 13 13 20 3 17 18 18 3 54 65 65 65 6 10 131 136 67 13 54 5 5 10 10 13 13 2 2 50 5 5 13 6 6 1 19 161 2 161 13 16 14 14 14 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __KVM_HOST_H #define __KVM_HOST_H #include <linux/types.h> #include <linux/hardirq.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/bug.h> #include <linux/minmax.h> #include <linux/mm.h> #include <linux/mmu_notifier.h> #include <linux/preempt.h> #include <linux/msi.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/rcupdate.h> #include <linux/ratelimit.h> #include <linux/err.h> #include <linux/irqflags.h> #include <linux/context_tracking.h> #include <linux/irqbypass.h> #include <linux/rcuwait.h> #include <linux/refcount.h> #include <linux/nospec.h> #include <linux/notifier.h> #include <linux/ftrace.h> #include <linux/hashtable.h> #include <linux/instrumentation.h> #include <linux/interval_tree.h> #include <linux/rbtree.h> #include <linux/xarray.h> #include <asm/signal.h> #include <linux/kvm.h> #include <linux/kvm_para.h> #include <linux/kvm_types.h> #include <asm/kvm_host.h> #include <linux/kvm_dirty_ring.h> #ifndef KVM_MAX_VCPU_IDS #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally * used in kvm, other bits are visible for userspace which are defined in * include/linux/kvm_h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) /* * Bit 63 of the memslot generation number is an "update in-progress flag", * e.g. is temporarily set for the duration of kvm_swap_active_memslots(). * This flag effectively creates a unique generation number that is used to * mark cached memslot data, e.g. MMIO accesses, as potentially being stale, * i.e. may (or may not) have come from the previous memslots generation. * * This is necessary because the actual memslots update is not atomic with * respect to the generation number update. Updating the generation number * first would allow a vCPU to cache a spte from the old memslots using the * new generation number, and updating the generation number after switching * to the new memslots would allow cache hits using the old generation number * to reference the defunct memslots. * * This mechanism is used to prevent getting hits in KVM's caches while a * memslot update is in-progress, and to prevent cache hits *after* updating * the actual generation number against accesses that were inserted into the * cache *before* the memslots were updated. */ #define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(63) /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 #ifndef KVM_MAX_NR_ADDRESS_SPACES #define KVM_MAX_NR_ADDRESS_SPACES 1 #endif /* * For the normal pfn, the highest 12 bits should be zero, * so we can mask bit 62 ~ bit 52 to indicate the error pfn, * mask bit 63 to indicate the noslot pfn. */ #define KVM_PFN_ERR_MASK (0x7ffULL << 52) #define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52) #define KVM_PFN_NOSLOT (0x1ULL << 63) #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) #define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4) /* * error pfns indicate that the gfn is in slot but faild to * translate it to pfn on host. */ static inline bool is_error_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_MASK); } /* * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted * by a pending signal. Note, the signal may or may not be fatal. */ static inline bool is_sigpending_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_ERR_SIGPENDING; } /* * error_noslot pfns indicate that the gfn can not be * translated to pfn - it is not in slot or failed to * translate it to pfn. */ static inline bool is_error_noslot_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); } /* noslot pfn indicates that the gfn is not in slot. */ static inline bool is_noslot_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_NOSLOT; } /* * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390) * provide own defines and kvm_is_error_hva */ #ifndef KVM_HVA_ERR_BAD #define KVM_HVA_ERR_BAD (PAGE_OFFSET) #define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) static inline bool kvm_is_error_hva(unsigned long addr) { return addr >= PAGE_OFFSET; } #endif static inline bool kvm_is_error_gpa(gpa_t gpa) { return gpa == INVALID_GPA; } #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) #define KVM_REQUEST_WAIT BIT(9) #define KVM_REQUEST_NO_ACTION BIT(10) /* * Architecture-independent vcpu->requests bit members * Bits 3-7 are reserved for more arch-independent bits. */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_DIRTY_RING_SOFT_FULL 3 #define KVM_REQUEST_ARCH_BASE 8 /* * KVM_REQ_OUTSIDE_GUEST_MODE exists is purely as way to force the vCPU to * OUTSIDE_GUEST_MODE. KVM_REQ_OUTSIDE_GUEST_MODE differs from a vCPU "kick" * in that it ensures the vCPU has reached OUTSIDE_GUEST_MODE before continuing * on. A kick only guarantees that the vCPU is on its way out, e.g. a previous * kick may have set vcpu->mode to EXITING_GUEST_MODE, and so there's no * guarantee the vCPU received an IPI and has actually exited guest mode. */ #define KVM_REQ_OUTSIDE_GUEST_MODE (KVM_REQUEST_NO_ACTION | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ BUILD_BUG_ON((unsigned)(nr) >= (sizeof_field(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE); \ (unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \ }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 extern struct mutex kvm_lock; extern struct list_head vm_list; struct kvm_io_range { gpa_t addr; int len; struct kvm_io_device *dev; }; #define NR_IOBUS_DEVS 1000 struct kvm_io_bus { int dev_count; int ioeventfd_count; struct kvm_io_range range[]; }; enum kvm_bus { KVM_MMIO_BUS, KVM_PIO_BUS, KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_FAST_MMIO_BUS, KVM_IOCSR_BUS, KVM_NR_BUSES }; int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val); int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie); int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); #ifdef CONFIG_KVM_ASYNC_PF struct kvm_async_pf { struct work_struct work; struct list_head link; struct list_head queue; struct kvm_vcpu *vcpu; gpa_t cr2_or_gpa; unsigned long addr; struct kvm_arch_async_pf arch; bool wakeup_all; bool notpresent_injected; }; void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { unsigned long attributes; }; enum kvm_gfn_range_filter { KVM_FILTER_SHARED = BIT(0), KVM_FILTER_PRIVATE = BIT(1), }; struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; gfn_t end; union kvm_mmu_notifier_arg arg; enum kvm_gfn_range_filter attr_filter; bool may_block; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); #endif enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, EXITING_GUEST_MODE, READING_SHADOW_PAGE_TABLES, }; struct kvm_host_map { /* * Only valid if the 'pfn' is managed by the host kernel (i.e. There is * a 'struct page' for it. When using mem= kernel parameter some memory * can be used as guest memory but they are not managed by host * kernel). */ struct page *pinned_page; struct page *page; void *hva; kvm_pfn_t pfn; kvm_pfn_t gfn; bool writable; }; /* * Used to check if the mapping is valid or not. Never use 'kvm_host_map' * directly to check for that. */ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map) { return !!map->hva; } static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) { return single_task_running() && !need_resched() && ktime_before(cur, stop); } /* * Sometimes a large or cross-page mmio needs to be broken up into separate * exits for userspace servicing. */ struct kvm_mmio_fragment { gpa_t gpa; void *data; unsigned len; }; struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier preempt_notifier; #endif int cpu; int vcpu_id; /* id given by userspace at creation */ int vcpu_idx; /* index into kvm->vcpu_array */ int ____srcu_idx; /* Don't use this directly. You've been warned. */ #ifdef CONFIG_PROVE_RCU int srcu_depth; #endif int mode; u64 requests; unsigned long guest_debug; struct mutex mutex; struct kvm_run *run; #ifndef __KVM_HAVE_ARCH_WQP struct rcuwait wait; #endif struct pid *pid; rwlock_t pid_lock; int sigset_active; sigset_t sigset; unsigned int halt_poll_ns; bool valid_wakeup; #ifdef CONFIG_HAS_IOMEM int mmio_needed; int mmio_read_completed; int mmio_is_write; int mmio_cur_fragment; int mmio_nr_fragments; struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS]; #endif #ifdef CONFIG_KVM_ASYNC_PF struct { u32 queued; struct list_head queue; struct list_head done; spinlock_t lock; } async_pf; #endif #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT /* * Cpu relax intercept or pause loop exit optimization * in_spin_loop: set when a vcpu does a pause loop exit * or cpu relax intercepted. * dy_eligible: indicates whether vcpu is eligible for directed yield. */ struct { bool in_spin_loop; bool dy_eligible; } spin_loop; #endif bool wants_to_run; bool preempted; bool ready; bool scheduled_out; struct kvm_vcpu_arch arch; struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; /* * The most recently used memslot by this vCPU and the slots generation * for which it is valid. * No wraparound protection is needed since generations won't overflow in * thousands of years, even assuming 1M memslot operations per second. */ struct kvm_memory_slot *last_used_slot; u64 last_used_slot_gen; }; /* * Start accounting time towards a guest. * Must be called before entering guest context. */ static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the * stime pending cputime to flush. */ instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); } /* * Enter guest context and enter an RCU extended quiescent state. * * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is * unsafe to use any code which may directly or indirectly use RCU, tracing * (including IRQ flag tracing), or lockdep. All code in this period must be * non-instrumentable. */ static __always_inline void guest_context_enter_irqoff(void) { /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode * is very similar to exiting to userspace from rcu point of view. In * addition CPU may stay in a guest mode for quite a long time (up to * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. */ if (!context_tracking_guest_enter()) { instrumentation_begin(); rcu_virt_note_context_switch(); instrumentation_end(); } } /* * Deprecated. Architectures should move to guest_timing_enter_irqoff() and * guest_state_enter_irqoff(). */ static __always_inline void guest_enter_irqoff(void) { guest_timing_enter_irqoff(); guest_context_enter_irqoff(); } /** * guest_state_enter_irqoff - Fixup state when entering a guest * * Entry to a guest will enable interrupts, but the kernel state is interrupts * disabled when this is invoked. Also tell RCU about it. * * 1) Trace interrupts on state * 2) Invoke context tracking if enabled to adjust RCU state * 3) Tell lockdep that interrupts are enabled * * Invoked from architecture specific code before entering a guest. * Must be called with interrupts disabled and the caller must be * non-instrumentable. * The caller has to invoke guest_timing_enter_irqoff() before this. * * Note: this is analogous to exit_to_user_mode(). */ static __always_inline void guest_state_enter_irqoff(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); guest_context_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); } /* * Exit guest context and exit an RCU extended quiescent state. * * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is * unsafe to use any code which may directly or indirectly use RCU, tracing * (including IRQ flag tracing), or lockdep. All code in this period must be * non-instrumentable. */ static __always_inline void guest_context_exit_irqoff(void) { /* * Guest mode is treated as a quiescent state, see * guest_context_enter_irqoff() for more details. */ if (!context_tracking_guest_exit()) { instrumentation_begin(); rcu_virt_note_context_switch(); instrumentation_end(); } } /* * Stop accounting time towards a guest. * Must be called after exiting guest context. */ static __always_inline void guest_timing_exit_irqoff(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } /* * Deprecated. Architectures should move to guest_state_exit_irqoff() and * guest_timing_exit_irqoff(). */ static __always_inline void guest_exit_irqoff(void) { guest_context_exit_irqoff(); guest_timing_exit_irqoff(); } static inline void guest_exit(void) { unsigned long flags; local_irq_save(flags); guest_exit_irqoff(); local_irq_restore(flags); } /** * guest_state_exit_irqoff - Establish state when returning from guest mode * * Entry from a guest disables interrupts, but guest mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. * * 1) Tell lockdep that interrupts are disabled * 2) Invoke context tracking if enabled to reactivate RCU * 3) Trace interrupts off state * * Invoked from architecture specific code after exiting a guest. * Must be invoked with interrupts disabled and the caller must be * non-instrumentable. * The caller has to invoke guest_timing_exit_irqoff() after this. * * Note: this is analogous to enter_from_user_mode(). */ static __always_inline void guest_state_exit_irqoff(void) { lockdep_hardirqs_off(CALLER_ADDR0); guest_context_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); instrumentation_end(); } static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* * The memory barrier ensures a previous write to vcpu->requests cannot * be reordered with the read of vcpu->mode. It pairs with the general * memory barrier following the write of vcpu->mode in VCPU RUN. */ smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } /* * Some of the bitops functions do not support too long bitmaps. * This number must be determined not to exceed such limits. */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) /* * Since at idle each memslot belongs to two memslot sets it has to contain * two embedded nodes for each data structure that it forms a part of. * * Two memslot sets (one active and one inactive) are necessary so the VM * continues to run on one memslot set while the other is being modified. * * These two memslot sets normally point to the same set of memslots. * They can, however, be desynchronized when performing a memslot management * operation by replacing the memslot to be modified by its copy. * After the operation is complete, both memslot sets once again point to * the same, common set of memslot data. * * The memslots themselves are independent of each other so they can be * individually added or deleted. */ struct kvm_memory_slot { struct hlist_node id_node[2]; struct interval_tree_node hva_node[2]; struct rb_node gfn_node[2]; gfn_t base_gfn; unsigned long npages; unsigned long *dirty_bitmap; struct kvm_arch_memory_slot arch; unsigned long userspace_addr; u32 flags; short id; u16 as_id; #ifdef CONFIG_KVM_PRIVATE_MEM struct { /* * Writes protected by kvm->slots_lock. Acquiring a * reference via kvm_gmem_get_file() is protected by * either kvm->slots_lock or kvm->srcu. */ struct file *file; pgoff_t pgoff; } gmem; #endif }; static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) { return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); } static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; } static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) { return ALIGN(memslot->npages, BITS_PER_LONG) / 8; } static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long len = kvm_dirty_bitmap_bytes(memslot); return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap); } #ifndef KVM_DIRTY_LOG_MANUAL_CAPS #define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE #endif struct kvm_s390_adapter_int { u64 ind_addr; u64 summary_addr; u64 ind_offset; u32 summary_offset; u32 adapter_id; }; struct kvm_hv_sint { u32 vcpu; u32 sint; }; struct kvm_xen_evtchn { u32 port; u32 vcpu_id; int vcpu_idx; u32 priority; }; struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); union { struct { unsigned irqchip; unsigned pin; } irqchip; struct { u32 address_lo; u32 address_hi; u32 data; u32 flags; u32 devid; } msi; struct kvm_s390_adapter_int adapter; struct kvm_hv_sint hv_sint; struct kvm_xen_evtchn xen_evtchn; }; struct hlist_node link; }; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING struct kvm_irq_routing_table { int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; u32 nr_rt_entries; /* * Array indexed by gsi. Each entry contains list of irq chips * the gsi is connected to. */ struct hlist_head map[] __counted_by(nr_rt_entries); }; #endif bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #ifndef KVM_INTERNAL_MEM_SLOTS #define KVM_INTERNAL_MEM_SLOTS 0 #endif #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) #if KVM_MAX_NR_ADDRESS_SPACES == 1 static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) { return KVM_MAX_NR_ADDRESS_SPACES; } static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; } #endif /* * Arch code must define kvm_arch_has_private_mem if support for private memory * is enabled. */ #if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; } #endif #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { return IS_ENABLED(CONFIG_HAVE_KVM_READONLY_MEM); } #endif struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; struct rb_root_cached hva_tree; struct rb_root gfn_tree; /* * The mapping table from slot id to memslot. * * 7-bit bucket count matches the size of the old id to index array for * 512 slots, while giving good performance with this slot count. * Higher bucket counts bring only small performance improvements but * always result in higher memory usage (even for lower memslot counts). */ DECLARE_HASHTABLE(id_hash, 7); int node_idx; }; struct kvm { #ifdef KVM_HAVE_MMU_RWLOCK rwlock_t mmu_lock; #else spinlock_t mmu_lock; #endif /* KVM_HAVE_MMU_RWLOCK */ struct mutex slots_lock; /* * Protects the arch-specific fields of struct kvm_memory_slots in * use by the VM. To be used under the slots_lock (above) or in a * kvm->srcu critical section where acquiring the slots_lock would * lead to deadlock with the synchronize_srcu in * kvm_swap_active_memslots(). */ struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ unsigned long nr_memslot_pages; /* The two memslot sets - active and inactive (per address space) */ struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2]; /* The current active memslot set for each address space */ struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES]; struct xarray vcpu_array; /* * Protected by slots_lock, but can be read outside if an * incorrect answer is acceptable. */ atomic_t nr_memslots_dirty_logging; /* Used to wait for completion of MMU notifiers. */ spinlock_t mn_invalidate_lock; unsigned long mn_active_invalidate_count; struct rcuwait mn_memslots_update_rcuwait; /* For management / invalidation of gfn_to_pfn_caches */ spinlock_t gpc_lock; struct list_head gpc_list; /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only * incremented after storing the kvm_vcpu pointer in vcpus, * and is accessed atomically. */ atomic_t online_vcpus; int max_vcpus; int created_vcpus; int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; #ifdef CONFIG_HAVE_KVM_IRQCHIP struct { spinlock_t lock; struct list_head items; /* resampler_list update side is protected by resampler_lock. */ struct list_head resampler_list; struct mutex resampler_lock; } irqfds; #endif struct list_head ioeventfds; struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; #ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; #endif struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP /* * Update side is protected by irq_lock. */ struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head irq_ack_notifier_list; #endif #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; gfn_t mmu_invalidate_range_start; gfn_t mmu_invalidate_range_end; #endif struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; bool override_halt_poll_ns; unsigned int max_halt_poll_ns; u32 dirty_ring_size; bool dirty_ring_with_bitmap; bool vm_bugged; bool vm_dead; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; #endif #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES /* Protected by slots_locks (for writes) and RCU (for reads) */ struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; }; #define kvm_err(fmt, ...) \ pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_info(fmt, ...) \ pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_debug(fmt, ...) \ pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_debug_ratelimited(fmt, ...) \ pr_debug_ratelimited("kvm [%i]: " fmt, task_pid_nr(current), \ ## __VA_ARGS__) #define kvm_pr_unimpl(fmt, ...) \ pr_err_ratelimited("kvm [%i]: " fmt, \ task_tgid_nr(current), ## __VA_ARGS__) /* The guest did something we don't support. */ #define vcpu_unimpl(vcpu, fmt, ...) \ kvm_pr_unimpl("vcpu%i, guest rIP: 0x%lx " fmt, \ (vcpu)->vcpu_id, kvm_rip_read(vcpu), ## __VA_ARGS__) #define vcpu_debug(vcpu, fmt, ...) \ kvm_debug("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) #define vcpu_debug_ratelimited(vcpu, fmt, ...) \ kvm_debug_ratelimited("vcpu%i " fmt, (vcpu)->vcpu_id, \ ## __VA_ARGS__) #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) static inline void kvm_vm_dead(struct kvm *kvm) { kvm->vm_dead = true; kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD); } static inline void kvm_vm_bugged(struct kvm *kvm) { kvm->vm_bugged = true; kvm_vm_dead(kvm); } #define KVM_BUG(cond, kvm, fmt...) \ ({ \ bool __ret = !!(cond); \ \ if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) #define KVM_BUG_ON(cond, kvm) \ ({ \ bool __ret = !!(cond); \ \ if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) /* * Note, "data corruption" refers to corruption of host kernel data structures, * not guest data. Guest data corruption, suspected or confirmed, that is tied * and contained to a single VM should *never* BUG() and potentially panic the * host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure * is corrupted and that corruption can have a cascading effect to other parts * of the hosts and/or to other VMs. */ #define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm) \ ({ \ bool __ret = !!(cond); \ \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) \ BUG_ON(__ret); \ else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PROVE_RCU WARN_ONCE(vcpu->srcu_depth++, "KVM: Illegal vCPU srcu_idx LOCK, depth=%d", vcpu->srcu_depth - 1); #endif vcpu->____srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); } static inline void kvm_vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu) { srcu_read_unlock(&vcpu->kvm->srcu, vcpu->____srcu_idx); #ifdef CONFIG_PROVE_RCU WARN_ONCE(--vcpu->srcu_depth, "KVM: Illegal vCPU srcu_idx UNLOCK, depth=%d", vcpu->srcu_depth); #endif } static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); } static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { int num_vcpus = atomic_read(&kvm->online_vcpus); /* * Explicitly verify the target vCPU is online, as the anti-speculation * logic only limits the CPU's ability to speculate, e.g. given a "bad" * index, clamping the index to 0 would return vCPU0, not NULL. */ if (i >= num_vcpus) return NULL; i = array_index_nospec(i, num_vcpus); /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); return xa_load(&kvm->vcpu_array, i); } #define kvm_for_each_vcpu(idx, vcpup, kvm) \ if (atomic_read(&kvm->online_vcpus)) \ xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ (atomic_read(&kvm->online_vcpus) - 1)) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { struct kvm_vcpu *vcpu = NULL; unsigned long i; if (id < 0) return NULL; if (id < KVM_MAX_VCPUS) vcpu = kvm_get_vcpu(kvm, id); if (vcpu && vcpu->vcpu_id == id) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) if (vcpu->vcpu_id == id) return vcpu; return NULL; } void kvm_destroy_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) { } #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd_init(void); void kvm_irqfd_exit(void); #else static inline int kvm_irqfd_init(void) { return 0; } static inline void kvm_irqfd_exit(void) { } #endif int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module); void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); } static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) { return __kvm_memslots(kvm, 0); } static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) { int as_id = kvm_arch_vcpu_memslots_id(vcpu); return __kvm_memslots(vcpu->kvm, as_id); } static inline bool kvm_memslots_empty(struct kvm_memslots *slots) { return RB_EMPTY_ROOT(&slots->gfn_tree); } bool kvm_are_all_memslots_empty(struct kvm *kvm); #define kvm_for_each_memslot(memslot, bkt, slots) \ hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \ if (WARN_ON_ONCE(!memslot->npages)) { \ } else static inline struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) { struct kvm_memory_slot *slot; int idx = slots->node_idx; hash_for_each_possible(slots->id_hash, slot, id_node[idx], id) { if (slot->id == id) return slot; } return NULL; } /* Iterator used for walking memslots that overlap a gfn range. */ struct kvm_memslot_iter { struct kvm_memslots *slots; struct rb_node *node; struct kvm_memory_slot *slot; }; static inline void kvm_memslot_iter_next(struct kvm_memslot_iter *iter) { iter->node = rb_next(iter->node); if (!iter->node) return; iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[iter->slots->node_idx]); } static inline void kvm_memslot_iter_start(struct kvm_memslot_iter *iter, struct kvm_memslots *slots, gfn_t start) { int idx = slots->node_idx; struct rb_node *tmp; struct kvm_memory_slot *slot; iter->slots = slots; /* * Find the so called "upper bound" of a key - the first node that has * its key strictly greater than the searched one (the start gfn in our case). */ iter->node = NULL; for (tmp = slots->gfn_tree.rb_node; tmp; ) { slot = container_of(tmp, struct kvm_memory_slot, gfn_node[idx]); if (start < slot->base_gfn) { iter->node = tmp; tmp = tmp->rb_left; } else { tmp = tmp->rb_right; } } /* * Find the slot with the lowest gfn that can possibly intersect with * the range, so we'll ideally have slot start <= range start */ if (iter->node) { /* * A NULL previous node means that the very first slot * already has a higher start gfn. * In this case slot start > range start. */ tmp = rb_prev(iter->node); if (tmp) iter->node = tmp; } else { /* a NULL node below means no slots */ iter->node = rb_last(&slots->gfn_tree); } if (iter->node) { iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[idx]); /* * It is possible in the slot start < range start case that the * found slot ends before or at range start (slot end <= range start) * and so it does not overlap the requested range. * * In such non-overlapping case the next slot (if it exists) will * already have slot start > range start, otherwise the logic above * would have found it instead of the current slot. */ if (iter->slot->base_gfn + iter->slot->npages <= start) kvm_memslot_iter_next(iter); } } static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_t end) { if (!iter->node) return false; /* * If this slot starts beyond or at the end of the range so does * every next one */ return iter->slot->base_gfn < end; } /* Iterate over each memslot at least partially intersecting [start, end) range */ #define kvm_for_each_memslot_in_gfn_range(iter, slots, start, end) \ for (kvm_memslot_iter_start(iter, slots, start); \ kvm_memslot_iter_is_valid(iter, end); \ kvm_memslot_iter_next(iter)) struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); /* * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: * - create a new memory slot * - delete an existing memory slot * - modify an existing memory slot * -- move it in the guest physical memory space * -- just change its flags * * Since flags can be changed by some of these operations, the following * differentiation is the best we can do for kvm_set_memory_region(): */ enum kvm_mr_change { KVM_MR_CREATE, KVM_MR_DELETE, KVM_MR_MOVE, KVM_MR_FLAGS_ONLY, }; int kvm_set_internal_memslot(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); /* flush all memory translations */ void kvm_arch_flush_shadow_all(struct kvm *kvm); /* flush memory translations pointing to 'slot' */ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages); struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write); static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { return __gfn_to_page(kvm, gfn, true); } unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); static inline void kvm_release_page_unused(struct page *page) { if (!page) return; put_page(page); } void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page, bool unused, bool dirty) { lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused); if (!page) return; /* * If the page that KVM got from the *primary MMU* is writable, and KVM * installed or reused a SPTE, mark the page/folio dirty. Note, this * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if * the GFN is write-protected. Folios can't be safely marked dirty * outside of mmu_lock as doing so could race with writeback on the * folio. As a result, KVM can't mark folios dirty in the fast page * fault handler, and so KVM must (somewhat) speculatively mark the * folio dirty if KVM could locklessly make the SPTE writable. */ if (unused) kvm_release_page_unused(page); else if (dirty) kvm_release_page_dirty(page); else kvm_release_page_clean(page); } kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page); static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool write, bool *writable, struct page **refcounted_page) { return __kvm_faultin_pfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); #define __kvm_get_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ __ret = get_user(v, __uaddr); \ __ret; \ }) #define kvm_get_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ \ __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT, \ offset_in_page(__gpa), v); \ }) #define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ __ret = put_user(v, __uaddr); \ if (!__ret) \ mark_page_dirty(kvm, gfn); \ __ret; \ }) #define kvm_put_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ \ __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ offset_in_page(__gpa), v); \ }) int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map) { return __kvm_vcpu_map(vcpu, gpa, map, true); } static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map) { return __kvm_vcpu_map(vcpu, gpa, map, false); } unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len); int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); /** * kvm_gpc_init - initialize gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * @kvm: pointer to kvm instance. * * This sets up a gfn_to_pfn_cache by initializing locks and assigning the * immutable attributes. Note, the cache must be zero-allocated (or zeroed by * the caller before init). */ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm); /** * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest * physical address. * * @gpc: struct gfn_to_pfn_cache object. * @gpa: guest physical address to map. * @len: sanity check; the range being access must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for * invalidations to be processed. Callers are required to use kvm_gpc_check() * to ensure that the cache is valid before accessing the target page. */ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_activate_hva - prepare a cached kernel mapping and HPA for a given HVA. * * @gpc: struct gfn_to_pfn_cache object. * @hva: userspace virtual address to map. * @len: sanity check; the range being access must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * The semantics of this function are the same as those of kvm_gpc_activate(). It * merely bypasses a layer of address translation. */ int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long hva, unsigned long len); /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * @len: sanity check; the range being access must fit a single page. * * @return: %true if the cache is still valid and the address matches. * %false if the cache is not valid. * * Callers outside IN_GUEST_MODE context should hold a read lock on @gpc->lock * while calling this function, and then continue to hold the lock until the * access is complete. * * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->scru for the memslot checks. */ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. * * @gpc: struct gfn_to_pfn_cache object. * @len: sanity check; the range being access must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful * return from this function does not mean the page can be immediately * accessed because it may have raced with an invalidation. Callers must * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * * This removes a cache from the VM's list to be processed on MMU notifier * invocation. */ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc); static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc) { return gpc->active && !kvm_is_error_gpa(gpc->gpa); } static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc) { return gpc->active && kvm_is_error_gpa(gpc->gpa); } void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); void kvm_vcpu_halt(struct kvm_vcpu *vcpu); bool kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages); void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot); #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min); int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min); int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc); void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif void kvm_mmu_invalidate_begin(struct kvm *kvm); void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_mmu_invalidate_end(struct kvm *kvm); bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot); #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty, struct kvm_memory_slot **memslot); #endif int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr); int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state); #endif #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry); #else static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING /* * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of * kvm_usage_count, i.e. at the beginning of the generic hardware enabling * sequence, and at the end of the generic hardware disabling sequence. */ void kvm_arch_enable_virtualization(void); void kvm_arch_disable_virtualization(void); /* * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to * do the actual twiddling of hardware bits. The hooks are called on all * online CPUs when KVM enables/disabled virtualization, and on a single CPU * when that CPU is onlined/offlined (including for Resume/Suspend). */ int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); void kvm_arch_pre_destroy_vm(struct kvm *kvm); void kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* * All architectures that want to use vzalloc currently also * need their own kvm_arch_alloc_vm implementation. */ static inline struct kvm *kvm_arch_alloc_vm(void) { return kzalloc(sizeof(struct kvm), GFP_KERNEL_ACCOUNT); } #endif static inline void __kvm_arch_free_vm(struct kvm *kvm) { kvfree(kvm); } #ifndef __KVM_HAVE_ARCH_VM_FREE static inline void kvm_arch_free_vm(struct kvm *kvm) { __kvm_arch_free_vm(kvm); } #endif #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { return -ENOTSUPP; } #else int kvm_arch_flush_remote_tlbs(struct kvm *kvm); #endif #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { return -EOPNOTSUPP; } #else int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages); #endif #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA void kvm_arch_register_noncoherent_dma(struct kvm *kvm); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); bool kvm_arch_has_noncoherent_dma(struct kvm *kvm); #else static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { } static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { } static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) { return false; } #endif #ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE void kvm_arch_start_assignment(struct kvm *kvm); void kvm_arch_end_assignment(struct kvm *kvm); bool kvm_arch_has_assigned_device(struct kvm *kvm); #else static inline void kvm_arch_start_assignment(struct kvm *kvm) { } static inline void kvm_arch_end_assignment(struct kvm *kvm) { } static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm) { return false; } #endif static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP return vcpu->arch.waitp; #else return &vcpu->wait; #endif } /* * Wake a vCPU if necessary, but don't do any stats/metadata updates. Returns * true if the vCPU was blocking and was awakened, false otherwise. */ static inline bool __kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { return !!rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu)); } static inline bool kvm_vcpu_is_blocking(struct kvm_vcpu *vcpu) { return rcuwait_active(kvm_arch_vcpu_get_wait(vcpu)); } #ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED /* * returns true if the virtual interrupt controller is initialized and * ready to accept virtual IRQ. On some architectures the virtual interrupt * controller is dynamically instantiated and this is not always true. */ bool kvm_arch_intc_initialized(struct kvm *kvm); #else static inline bool kvm_arch_intc_initialized(struct kvm *kvm) { return true; } #endif #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); void kvm_unregister_perf_callbacks(void); #else static inline void kvm_register_perf_callbacks(void *ign) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); struct kvm_irq_ack_notifier { struct hlist_node link; unsigned gsi; void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi); int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* * Returns a pointer to the memslot if it contains gfn. * Otherwise returns NULL. */ static inline struct kvm_memory_slot * try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { if (!slot) return NULL; if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) return slot; else return NULL; } /* * Returns a pointer to the memslot that contains gfn. Otherwise returns NULL. * * With "approx" set returns the memslot also when the address falls * in a hole. In that case one of the memslots bordering the hole is * returned. */ static inline struct kvm_memory_slot * search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; struct rb_node *node; int idx = slots->node_idx; slot = NULL; for (node = slots->gfn_tree.rb_node; node; ) { slot = container_of(node, struct kvm_memory_slot, gfn_node[idx]); if (gfn >= slot->base_gfn) { if (gfn < slot->base_gfn + slot->npages) return slot; node = node->rb_right; } else node = node->rb_left; } return approx ? slot : NULL; } static inline struct kvm_memory_slot * ____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; slot = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot); slot = try_get_memslot(slot, gfn); if (slot) return slot; slot = search_memslots(slots, gfn, approx); if (slot) { atomic_long_set(&slots->last_used_slot, (unsigned long)slot); return slot; } return NULL; } /* * __gfn_to_memslot() and its descendants are here to allow arch code to inline * the lookups in hot paths. gfn_to_memslot() itself isn't here as an inline * because that would bloat other code too much. */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { return ____gfn_to_memslot(slots, gfn, false); } static inline unsigned long __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { /* * The index was checked originally in search_memslots. To avoid * that a malicious guest builds a Spectre gadget out of e.g. page * table walks, do not let the processor speculate loads outside * the guest's registered memslots. */ unsigned long offset = gfn - slot->base_gfn; offset = array_index_nospec(offset, slot->npages); return slot->userspace_addr + offset * PAGE_SIZE; } static inline int memslot_id(struct kvm *kvm, gfn_t gfn) { return gfn_to_memslot(kvm, gfn)->id; } static inline gfn_t hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) { gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT; return slot->base_gfn + gfn_offset; } static inline gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; } static inline gfn_t gpa_to_gfn(gpa_t gpa) { return (gfn_t)(gpa >> PAGE_SHIFT); } static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; } static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) { unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); return !kvm_is_error_hva(hva); } static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) { lockdep_assert_held(&gpc->lock); if (!gpc->memslot) return; mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); } enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, }; struct kvm_stat_data { struct kvm *kvm; const struct _kvm_stats_desc *desc; enum kvm_stat_kind kind; }; struct _kvm_stats_desc { struct kvm_stats_desc desc; char name[KVM_STATS_NAME_SIZE]; }; #define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ .size = sz, \ .bucket_size = bsz #define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } #define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } #define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } #define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ #define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ unit, base, exponent, 1, 0) #define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ unit, base, exponent, sz, bsz) #define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Instantaneous counter, read only */ #define STATS_DESC_ICOUNTER(SCOPE, name) \ STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Peak counter, read/write */ #define STATS_DESC_PCOUNTER(SCOPE, name) \ STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Instantaneous boolean value, read only */ #define STATS_DESC_IBOOLEAN(SCOPE, name) \ STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ KVM_STATS_BASE_POW10, 0) /* Peak (sticky) boolean value, read/write */ #define STATS_DESC_PBOOLEAN(SCOPE, name) \ STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ KVM_STATS_BASE_POW10, 0) /* Cumulative time in nanosecond */ #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) /* Linear histogram for time in nanosecond */ #define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9, sz, bsz) /* Logarithmic histogram for time in nanosecond */ #define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_attempted_poll), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); /** * kvm_stats_linear_hist_update() - Update bucket value for linear histogram * statistics data. * * @data: start address of the stats data * @size: the number of bucket of the stats data * @value: the new value used to update the linear histogram's bucket * @bucket_size: the size (width) of a bucket */ static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, u64 value, size_t bucket_size) { size_t index = div64_u64(value, bucket_size); index = min(index, size - 1); ++data[index]; } /** * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram * statistics data. * * @data: start address of the stats data * @size: the number of bucket of the stats data * @value: the new value used to update the logarithmic histogram's bucket */ static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) { size_t index = fls64(value); index = min(index, size - 1); ++data[index]; } #define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) #define KVM_STATS_LOG_HIST_UPDATE(array, value) \ kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) return 1; /* * Ensure the read of mmu_invalidate_in_progress happens before * the read of mmu_invalidate_seq. This interacts with the * smp_wmb() in mmu_notifier_invalidate_range_end to make sure * that the caller either sees the old (non-zero) value of * mmu_invalidate_in_progress or the new (incremented) value of * mmu_invalidate_seq. * * PowerPC Book3s HV KVM calls this under a per-page lock rather * than under kvm->mmu_lock, for scalability, so can't rely on * kvm->mmu_lock to keep things ordered. */ smp_rmb(); if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, unsigned long mmu_seq, gfn_t gfn) { lockdep_assert_held(&kvm->mmu_lock); /* * If mmu_invalidate_in_progress is non-zero, then the range maintained * by kvm_mmu_notifier_invalidate_range_start contains all addresses * that might be being invalidated. Note that it may include some false * positives, due to shortcuts when handing concurrent invalidations. */ if (unlikely(kvm->mmu_invalidate_in_progress)) { /* * Dropping mmu_lock after bumping mmu_invalidate_in_progress * but before updating the range is a KVM bug. */ if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA || kvm->mmu_invalidate_range_end == INVALID_GPA)) return 1; if (gfn >= kvm->mmu_invalidate_range_start && gfn < kvm->mmu_invalidate_range_end) return 1; } if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } /* * This lockless version of the range-based retry check *must* be paired with a * call to the locked version after acquiring mmu_lock, i.e. this is safe to * use only as a pre-check to avoid contending mmu_lock. This version *will* * get false negatives and false positives. */ static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, unsigned long mmu_seq, gfn_t gfn) { /* * Use READ_ONCE() to ensure the in-progress flag and sequence counter * are always read from memory, e.g. so that checking for retry in a * loop won't result in an infinite retry loop. Don't force loads for * start+end, as the key to avoiding infinite retry loops is observing * the 1=>0 transition of in-progress, i.e. getting false negatives * due to stale start+end values is acceptable. */ if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) && gfn >= kvm->mmu_invalidate_range_start && gfn < kvm->mmu_invalidate_range_end) return true; return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; } #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */ bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); int kvm_init_irq_routing(struct kvm *kvm); int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); #else static inline void kvm_free_irq_routing(struct kvm *kvm) {} static inline int kvm_init_irq_routing(struct kvm *kvm) { return 0; } #endif int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); void kvm_eventfd_init(struct kvm *kvm); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); #ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); bool kvm_notify_irqfd_resampler(struct kvm *kvm, unsigned int irqchip, unsigned int pin); void kvm_irq_routing_update(struct kvm *); #else static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { return -EINVAL; } static inline void kvm_irqfd_release(struct kvm *kvm) {} static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, unsigned int irqchip, unsigned int pin) { return false; } #endif /* CONFIG_HAVE_KVM_IRQCHIP */ void kvm_arch_irq_routing_update(struct kvm *kvm); static inline void __kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* * Ensure the rest of the request is published to kvm_check_request's * caller. Paired with the smp_mb__after_atomic in kvm_check_request. */ smp_wmb(); set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* * Request that don't require vCPU action should never be logged in * vcpu->requests. The vCPU won't clear the request, so it will stay * logged indefinitely and prevent the vCPU from entering the guest. */ BUILD_BUG_ON(!__builtin_constant_p(req) || (req & KVM_REQUEST_NO_ACTION)); __kvm_make_request(req, vcpu); } static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->requests); } static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) { return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) { clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { if (kvm_test_request(req, vcpu)) { kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's * caller. Paired with the smp_wmb in kvm_make_request. */ smp_mb__after_atomic(); return true; } else { return false; } } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool kvm_rebooting; #endif extern unsigned int halt_poll_ns; extern unsigned int halt_poll_ns_grow; extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; struct kvm_device { const struct kvm_device_ops *ops; struct kvm *kvm; void *private; struct list_head vm_node; }; /* create, destroy, and name are mandatory */ struct kvm_device_ops { const char *name; /* * create is called holding kvm->lock and any operations not suitable * to do while holding the lock should be deferred to init (see * below). */ int (*create)(struct kvm_device *dev, u32 type); /* * init is called after create if create is successful and is called * outside of holding kvm->lock. */ void (*init)(struct kvm_device *dev); /* * Destroy is responsible for freeing dev. * * Destroy may be called before or after destructors are called * on emulated I/O regions, depending on whether a reference is * held by a vcpu or other kvm component that gets destroyed * after the emulated I/O. */ void (*destroy)(struct kvm_device *dev); /* * Release is an alternative method to free the device. It is * called when the device file descriptor is closed. Once * release is called, the destroy method will not be called * anymore as the device is removed from the device list of * the VM. kvm->lock is held. */ void (*release)(struct kvm_device *dev); int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, unsigned long arg); int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); }; struct kvm_device *kvm_device_from_filp(struct file *filp); int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.in_spin_loop = val; } static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.dy_eligible = val; } #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { } static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) { return (memslot && memslot->id < KVM_USER_MEM_SLOTS && !(memslot->flags & KVM_MEMSLOT_INVALID)); } struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS /* If we wakeup during the poll time, was it a sucessful poll? */ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { return vcpu->valid_wakeup; } #else static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { return true; } #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ #ifdef CONFIG_HAVE_KVM_NO_POLL /* Callback that tells if we must not poll */ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu); #else static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { return false; } #endif /* CONFIG_HAVE_KVM_NO_POLL */ #ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); #else static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -ENOIOCTLCMD; } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); #else static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) { return 0; } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_INTR; vcpu->stat.signal_exits++; } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ /* * If more than one page is being (un)accounted, @virt must be the address of * the first page of a block of pages what were allocated together (i.e * accounted together). * * kvm_account_pgtable_pages() is thread-safe because mod_lruvec_page_state() * is thread-safe. */ static inline void kvm_account_pgtable_pages(void *virt, int nr) { mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr); } /* * This defines how many reserved entries we want to keep before we * kick the vcpu to the userspace to avoid dirty ring full. This * value can be tuned to higher if e.g. PML is enabled on the host. */ #define KVM_DIRTY_RING_RSVD_ENTRIES 64 /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t size, bool is_write, bool is_exec, bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; vcpu->run->memory_fault.size = size; /* RWX flags are not (yet) defined or communicated to userspace. */ vcpu->run->memory_fault.flags = 0; if (is_private) vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) { return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)); } bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long mask, unsigned long attrs); bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { return false; } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ #ifdef CONFIG_KVM_PRIVATE_MEM int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order); #else static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order) { KVM_BUG_ON(1, kvm); return -EIO; } #endif /* CONFIG_KVM_PRIVATE_MEM */ #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif #ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * * @kvm: KVM instance * @gfn: starting GFN to be populated * @src: userspace-provided buffer containing data to copy into GFN range * (passed to @post_populate, and incremented on each iteration * if not NULL) * @npages: number of pages to copy from userspace-buffer * @post_populate: callback to issue for each gmem page that backs the GPA * range * @opaque: opaque data to pass to @post_populate callback * * This is primarily intended for cases where a gmem-backed GPA range needs * to be initialized with userspace-provided data prior to being mapped into * the guest as a private page. This should be called with the slots->lock * held so that caller-enforced invariants regarding the expected memory * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES. * * Returns the number of pages that were populated. */ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, void __user *src, int order, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); #endif #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif #endif
3 3 3 3 3 3 3 3 3 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 1 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 7 7 3 3 1 1 1 3 7 4 4 4 4 2 2 2 9 9 9 9 8 8 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 // SPDX-License-Identifier: GPL-1.0+ /* generic HDLC line discipline for Linux * * Written by Paul Fulghum paulkf@microgate.com * for Microgate Corporation * * Microgate and SyncLink are registered trademarks of Microgate Corporation * * Adapted from ppp.c, written by Michael Callahan <callahan@maths.ox.ac.uk>, * Al Longyear <longyear@netcom.com>, * Paul Mackerras <Paul.Mackerras@cs.anu.edu.au> * * Original release 01/11/99 * * This module implements the tty line discipline N_HDLC for use with * tty device drivers that support bit-synchronous HDLC communications. * * All HDLC data is frame oriented which means: * * 1. tty write calls represent one complete transmit frame of data * The device driver should accept the complete frame or none of * the frame (busy) in the write method. Each write call should have * a byte count in the range of 2-65535 bytes (2 is min HDLC frame * with 1 addr byte and 1 ctrl byte). The max byte count of 65535 * should include any crc bytes required. For example, when using * CCITT CRC32, 4 crc bytes are required, so the maximum size frame * the application may transmit is limited to 65531 bytes. For CCITT * CRC16, the maximum application frame size would be 65533. * * * 2. receive callbacks from the device driver represents * one received frame. The device driver should bypass * the tty flip buffer and call the line discipline receive * callback directly to avoid fragmenting or concatenating * multiple frames into a single receive callback. * * The HDLC line discipline queues the receive frames in separate * buffers so complete receive frames can be returned by the * tty read calls. * * 3. tty read calls returns an entire frame of data or nothing. * * 4. all send and receive data is considered raw. No processing * or translation is performed by the line discipline, regardless * of the tty flags * * 5. When line discipline is queried for the amount of receive * data available (FIOC), 0 is returned if no data available, * otherwise the count of the next available frame is returned. * (instead of the sum of all received frame counts). * * These conventions allow the standard tty programming interface * to be used for synchronous HDLC applications when used with * this line discipline (or another line discipline that is frame * oriented such as N_PPP). * * The SyncLink driver (synclink.c) implements both asynchronous * (using standard line discipline N_TTY) and synchronous HDLC * (using N_HDLC) communications, with the latter using the above * conventions. * * This implementation is very basic and does not maintain * any statistics. The main point is to enforce the raw data * and frame orientation of HDLC communications. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/poll.h> #include <linux/in.h> #include <linux/ioctl.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/string.h> /* used in new tty drivers */ #include <linux/signal.h> /* used in new tty drivers */ #include <linux/if.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include "tty.h" /* * Buffers for individual HDLC frames */ #define MAX_HDLC_FRAME_SIZE 65535 #define DEFAULT_RX_BUF_COUNT 10 #define MAX_RX_BUF_COUNT 60 #define DEFAULT_TX_BUF_COUNT 3 struct n_hdlc_buf { struct list_head list_item; size_t count; u8 buf[]; }; struct n_hdlc_buf_list { struct list_head list; int count; spinlock_t spinlock; }; /** * struct n_hdlc - per device instance data structure * @tbusy: reentrancy flag for tx wakeup code * @woke_up: tx wakeup needs to be run again as it was called while @tbusy * @tx_buf_list: list of pending transmit frame buffers * @rx_buf_list: list of received frame buffers * @tx_free_buf_list: list unused transmit frame buffers * @rx_free_buf_list: list unused received frame buffers */ struct n_hdlc { bool tbusy; bool woke_up; struct n_hdlc_buf_list tx_buf_list; struct n_hdlc_buf_list rx_buf_list; struct n_hdlc_buf_list tx_free_buf_list; struct n_hdlc_buf_list rx_free_buf_list; struct work_struct write_work; struct tty_struct *tty_for_write_work; }; /* * HDLC buffer list manipulation functions */ static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf); static void n_hdlc_buf_put(struct n_hdlc_buf_list *list, struct n_hdlc_buf *buf); static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *list); /* Local functions */ static struct n_hdlc *n_hdlc_alloc(void); static void n_hdlc_tty_write_work(struct work_struct *work); /* max frame size for memory allocations */ static int maxframe = 4096; static void flush_rx_queue(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; struct n_hdlc_buf *buf; while ((buf = n_hdlc_buf_get(&n_hdlc->rx_buf_list))) n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, buf); } static void flush_tx_queue(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; struct n_hdlc_buf *buf; while ((buf = n_hdlc_buf_get(&n_hdlc->tx_buf_list))) n_hdlc_buf_put(&n_hdlc->tx_free_buf_list, buf); } static void n_hdlc_free_buf_list(struct n_hdlc_buf_list *list) { struct n_hdlc_buf *buf; do { buf = n_hdlc_buf_get(list); kfree(buf); } while (buf); } /** * n_hdlc_tty_close - line discipline close * @tty: pointer to tty info structure * * Called when the line discipline is changed to something * else, the tty is closed, or the tty detects a hangup. */ static void n_hdlc_tty_close(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; #if defined(TTY_NO_WRITE_SPLIT) clear_bit(TTY_NO_WRITE_SPLIT, &tty->flags); #endif tty->disc_data = NULL; /* Ensure that the n_hdlcd process is not hanging on select()/poll() */ wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); cancel_work_sync(&n_hdlc->write_work); n_hdlc_free_buf_list(&n_hdlc->rx_free_buf_list); n_hdlc_free_buf_list(&n_hdlc->tx_free_buf_list); n_hdlc_free_buf_list(&n_hdlc->rx_buf_list); n_hdlc_free_buf_list(&n_hdlc->tx_buf_list); kfree(n_hdlc); } /* end of n_hdlc_tty_close() */ /** * n_hdlc_tty_open - called when line discipline changed to n_hdlc * @tty: pointer to tty info structure * * Returns 0 if success, otherwise error code */ static int n_hdlc_tty_open(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; pr_debug("%s() called (device=%s)\n", __func__, tty->name); /* There should not be an existing table for this slot. */ if (n_hdlc) { pr_err("%s: tty already associated!\n", __func__); return -EEXIST; } n_hdlc = n_hdlc_alloc(); if (!n_hdlc) { pr_err("%s: n_hdlc_alloc failed\n", __func__); return -ENFILE; } INIT_WORK(&n_hdlc->write_work, n_hdlc_tty_write_work); n_hdlc->tty_for_write_work = tty; tty->disc_data = n_hdlc; tty->receive_room = 65536; /* change tty_io write() to not split large writes into 8K chunks */ set_bit(TTY_NO_WRITE_SPLIT, &tty->flags); /* flush receive data from driver */ tty_driver_flush_buffer(tty); return 0; } /* end of n_tty_hdlc_open() */ /** * n_hdlc_send_frames - send frames on pending send buffer list * @n_hdlc: pointer to ldisc instance data * @tty: pointer to tty instance data * * Send frames on pending send buffer list until the driver does not accept a * frame (busy) this function is called after adding a frame to the send buffer * list and by the tty wakeup callback. */ static void n_hdlc_send_frames(struct n_hdlc *n_hdlc, struct tty_struct *tty) { unsigned long flags; struct n_hdlc_buf *tbuf; ssize_t actual; check_again: spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags); if (n_hdlc->tbusy) { n_hdlc->woke_up = true; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); return; } n_hdlc->tbusy = true; n_hdlc->woke_up = false; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); tbuf = n_hdlc_buf_get(&n_hdlc->tx_buf_list); while (tbuf) { pr_debug("sending frame %p, count=%zu\n", tbuf, tbuf->count); /* Send the next block of data to device */ set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); actual = tty->ops->write(tty, tbuf->buf, tbuf->count); /* rollback was possible and has been done */ if (actual == -ERESTARTSYS) { n_hdlc_buf_return(&n_hdlc->tx_buf_list, tbuf); break; } /* if transmit error, throw frame away by */ /* pretending it was accepted by driver */ if (actual < 0) actual = tbuf->count; if (actual == tbuf->count) { pr_debug("frame %p completed\n", tbuf); /* free current transmit buffer */ n_hdlc_buf_put(&n_hdlc->tx_free_buf_list, tbuf); /* wait up sleeping writers */ wake_up_interruptible(&tty->write_wait); /* get next pending transmit buffer */ tbuf = n_hdlc_buf_get(&n_hdlc->tx_buf_list); } else { pr_debug("frame %p pending\n", tbuf); /* * the buffer was not accepted by driver, * return it back into tx queue */ n_hdlc_buf_return(&n_hdlc->tx_buf_list, tbuf); break; } } if (!tbuf) clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); /* Clear the re-entry flag */ spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags); n_hdlc->tbusy = false; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); if (n_hdlc->woke_up) goto check_again; } /* end of n_hdlc_send_frames() */ /** * n_hdlc_tty_write_work - Asynchronous callback for transmit wakeup * @work: pointer to work_struct * * Called when low level device driver can accept more send data. */ static void n_hdlc_tty_write_work(struct work_struct *work) { struct n_hdlc *n_hdlc = container_of(work, struct n_hdlc, write_work); struct tty_struct *tty = n_hdlc->tty_for_write_work; n_hdlc_send_frames(n_hdlc, tty); } /* end of n_hdlc_tty_write_work() */ /** * n_hdlc_tty_wakeup - Callback for transmit wakeup * @tty: pointer to associated tty instance data * * Called when low level device driver can accept more send data. */ static void n_hdlc_tty_wakeup(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; schedule_work(&n_hdlc->write_work); } /* end of n_hdlc_tty_wakeup() */ /** * n_hdlc_tty_receive - Called by tty driver when receive data is available * @tty: pointer to tty instance data * @data: pointer to received data * @flags: pointer to flags for data * @count: count of received data in bytes * * Called by tty low level driver when receive data is available. Data is * interpreted as one HDLC frame. */ static void n_hdlc_tty_receive(struct tty_struct *tty, const u8 *data, const u8 *flags, size_t count) { register struct n_hdlc *n_hdlc = tty->disc_data; register struct n_hdlc_buf *buf; pr_debug("%s() called count=%zu\n", __func__, count); if (count > maxframe) { pr_debug("rx count>maxframesize, data discarded\n"); return; } /* get a free HDLC buffer */ buf = n_hdlc_buf_get(&n_hdlc->rx_free_buf_list); if (!buf) { /* * no buffers in free list, attempt to allocate another rx * buffer unless the maximum count has been reached */ if (n_hdlc->rx_buf_list.count < MAX_RX_BUF_COUNT) buf = kmalloc(struct_size(buf, buf, maxframe), GFP_ATOMIC); } if (!buf) { pr_debug("no more rx buffers, data discarded\n"); return; } /* copy received data to HDLC buffer */ memcpy(buf->buf, data, count); buf->count = count; /* add HDLC buffer to list of received frames */ n_hdlc_buf_put(&n_hdlc->rx_buf_list, buf); /* wake up any blocked reads and perform async signalling */ wake_up_interruptible(&tty->read_wait); if (tty->fasync != NULL) kill_fasync(&tty->fasync, SIGIO, POLL_IN); } /* end of n_hdlc_tty_receive() */ /** * n_hdlc_tty_read - Called to retrieve one frame of data (if available) * @tty: pointer to tty instance data * @file: pointer to open file object * @kbuf: pointer to returned data buffer * @nr: size of returned data buffer * @cookie: stored rbuf from previous run * @offset: offset into the data buffer * * Returns the number of bytes returned or error code. */ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file, u8 *kbuf, size_t nr, void **cookie, unsigned long offset) { struct n_hdlc *n_hdlc = tty->disc_data; int ret = 0; struct n_hdlc_buf *rbuf; DECLARE_WAITQUEUE(wait, current); /* Is this a repeated call for an rbuf we already found earlier? */ rbuf = *cookie; if (rbuf) goto have_rbuf; add_wait_queue(&tty->read_wait, &wait); for (;;) { if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) { ret = -EIO; break; } if (tty_hung_up_p(file)) break; set_current_state(TASK_INTERRUPTIBLE); rbuf = n_hdlc_buf_get(&n_hdlc->rx_buf_list); if (rbuf) break; /* no data */ if (tty_io_nonblock(tty, file)) { ret = -EAGAIN; break; } schedule(); if (signal_pending(current)) { ret = -EINTR; break; } } remove_wait_queue(&tty->read_wait, &wait); __set_current_state(TASK_RUNNING); if (!rbuf) return ret; *cookie = rbuf; have_rbuf: /* Have we used it up entirely? */ if (offset >= rbuf->count) goto done_with_rbuf; /* More data to go, but can't copy any more? EOVERFLOW */ ret = -EOVERFLOW; if (!nr) goto done_with_rbuf; /* Copy as much data as possible */ ret = rbuf->count - offset; if (ret > nr) ret = nr; memcpy(kbuf, rbuf->buf+offset, ret); offset += ret; /* If we still have data left, we leave the rbuf in the cookie */ if (offset < rbuf->count) return ret; done_with_rbuf: *cookie = NULL; if (n_hdlc->rx_free_buf_list.count > DEFAULT_RX_BUF_COUNT) kfree(rbuf); else n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, rbuf); return ret; } /* end of n_hdlc_tty_read() */ /** * n_hdlc_tty_write - write a single frame of data to device * @tty: pointer to associated tty device instance data * @file: pointer to file object data * @data: pointer to transmit data (one frame) * @count: size of transmit frame in bytes * * Returns the number of bytes written (or error code). */ static ssize_t n_hdlc_tty_write(struct tty_struct *tty, struct file *file, const u8 *data, size_t count) { struct n_hdlc *n_hdlc = tty->disc_data; DECLARE_WAITQUEUE(wait, current); struct n_hdlc_buf *tbuf; ssize_t error = 0; pr_debug("%s() called count=%zd\n", __func__, count); /* verify frame size */ if (count > maxframe) { pr_debug("%s: truncating user packet from %zu to %d\n", __func__, count, maxframe); count = maxframe; } add_wait_queue(&tty->write_wait, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); tbuf = n_hdlc_buf_get(&n_hdlc->tx_free_buf_list); if (tbuf) break; if (tty_io_nonblock(tty, file)) { error = -EAGAIN; break; } schedule(); if (signal_pending(current)) { error = -EINTR; break; } } __set_current_state(TASK_RUNNING); remove_wait_queue(&tty->write_wait, &wait); if (!error) { /* Retrieve the user's buffer */ memcpy(tbuf->buf, data, count); /* Send the data */ tbuf->count = error = count; n_hdlc_buf_put(&n_hdlc->tx_buf_list, tbuf); n_hdlc_send_frames(n_hdlc, tty); } return error; } /* end of n_hdlc_tty_write() */ /** * n_hdlc_tty_ioctl - process IOCTL system call for the tty device. * @tty: pointer to tty instance data * @cmd: IOCTL command code * @arg: argument for IOCTL call (cmd dependent) * * Returns command dependent result. */ static int n_hdlc_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct n_hdlc *n_hdlc = tty->disc_data; int error = 0; int count; unsigned long flags; struct n_hdlc_buf *buf = NULL; pr_debug("%s() called %d\n", __func__, cmd); switch (cmd) { case FIONREAD: /* report count of read data available */ /* in next available frame (if any) */ spin_lock_irqsave(&n_hdlc->rx_buf_list.spinlock, flags); buf = list_first_entry_or_null(&n_hdlc->rx_buf_list.list, struct n_hdlc_buf, list_item); if (buf) count = buf->count; else count = 0; spin_unlock_irqrestore(&n_hdlc->rx_buf_list.spinlock, flags); error = put_user(count, (int __user *)arg); break; case TIOCOUTQ: /* get the pending tx byte count in the driver */ count = tty_chars_in_buffer(tty); /* add size of next output frame in queue */ spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags); buf = list_first_entry_or_null(&n_hdlc->tx_buf_list.list, struct n_hdlc_buf, list_item); if (buf) count += buf->count; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); error = put_user(count, (int __user *)arg); break; case TCFLSH: switch (arg) { case TCIOFLUSH: case TCOFLUSH: flush_tx_queue(tty); } fallthrough; /* to default */ default: error = n_tty_ioctl_helper(tty, cmd, arg); break; } return error; } /* end of n_hdlc_tty_ioctl() */ /** * n_hdlc_tty_poll - TTY callback for poll system call * @tty: pointer to tty instance data * @filp: pointer to open file object for device * @wait: wait queue for operations * * Determine which operations (read/write) will not block and return info * to caller. * Returns a bit mask containing info on which ops will not block. */ static __poll_t n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp, poll_table *wait) { struct n_hdlc *n_hdlc = tty->disc_data; __poll_t mask = 0; /* * queue the current process into any wait queue that may awaken in the * future (read and write) */ poll_wait(filp, &tty->read_wait, wait); poll_wait(filp, &tty->write_wait, wait); /* set bits for operations that won't block */ if (!list_empty(&n_hdlc->rx_buf_list.list)) mask |= EPOLLIN | EPOLLRDNORM; /* readable */ if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) mask |= EPOLLHUP; if (tty_hung_up_p(filp)) mask |= EPOLLHUP; if (!tty_is_writelocked(tty) && !list_empty(&n_hdlc->tx_free_buf_list.list)) mask |= EPOLLOUT | EPOLLWRNORM; /* writable */ return mask; } /* end of n_hdlc_tty_poll() */ static void n_hdlc_alloc_buf(struct n_hdlc_buf_list *list, unsigned int count, const char *name) { struct n_hdlc_buf *buf; unsigned int i; for (i = 0; i < count; i++) { buf = kmalloc(struct_size(buf, buf, maxframe), GFP_KERNEL); if (!buf) { pr_debug("%s(), kmalloc() failed for %s buffer %u\n", __func__, name, i); return; } n_hdlc_buf_put(list, buf); } } /** * n_hdlc_alloc - allocate an n_hdlc instance data structure * * Returns a pointer to newly created structure if success, otherwise %NULL */ static struct n_hdlc *n_hdlc_alloc(void) { struct n_hdlc *n_hdlc = kzalloc(sizeof(*n_hdlc), GFP_KERNEL); if (!n_hdlc) return NULL; spin_lock_init(&n_hdlc->rx_free_buf_list.spinlock); spin_lock_init(&n_hdlc->tx_free_buf_list.spinlock); spin_lock_init(&n_hdlc->rx_buf_list.spinlock); spin_lock_init(&n_hdlc->tx_buf_list.spinlock); INIT_LIST_HEAD(&n_hdlc->rx_free_buf_list.list); INIT_LIST_HEAD(&n_hdlc->tx_free_buf_list.list); INIT_LIST_HEAD(&n_hdlc->rx_buf_list.list); INIT_LIST_HEAD(&n_hdlc->tx_buf_list.list); n_hdlc_alloc_buf(&n_hdlc->rx_free_buf_list, DEFAULT_RX_BUF_COUNT, "rx"); n_hdlc_alloc_buf(&n_hdlc->tx_free_buf_list, DEFAULT_TX_BUF_COUNT, "tx"); return n_hdlc; } /* end of n_hdlc_alloc() */ /** * n_hdlc_buf_return - put the HDLC buffer after the head of the specified list * @buf_list: pointer to the buffer list * @buf: pointer to the buffer */ static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf) { unsigned long flags; spin_lock_irqsave(&buf_list->spinlock, flags); list_add(&buf->list_item, &buf_list->list); buf_list->count++; spin_unlock_irqrestore(&buf_list->spinlock, flags); } /** * n_hdlc_buf_put - add specified HDLC buffer to tail of specified list * @buf_list: pointer to buffer list * @buf: pointer to buffer */ static void n_hdlc_buf_put(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf) { unsigned long flags; spin_lock_irqsave(&buf_list->spinlock, flags); list_add_tail(&buf->list_item, &buf_list->list); buf_list->count++; spin_unlock_irqrestore(&buf_list->spinlock, flags); } /* end of n_hdlc_buf_put() */ /** * n_hdlc_buf_get - remove and return an HDLC buffer from list * @buf_list: pointer to HDLC buffer list * * Remove and return an HDLC buffer from the head of the specified HDLC buffer * list. * Returns a pointer to HDLC buffer if available, otherwise %NULL. */ static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *buf_list) { unsigned long flags; struct n_hdlc_buf *buf; spin_lock_irqsave(&buf_list->spinlock, flags); buf = list_first_entry_or_null(&buf_list->list, struct n_hdlc_buf, list_item); if (buf) { list_del(&buf->list_item); buf_list->count--; } spin_unlock_irqrestore(&buf_list->spinlock, flags); return buf; } /* end of n_hdlc_buf_get() */ static struct tty_ldisc_ops n_hdlc_ldisc = { .owner = THIS_MODULE, .num = N_HDLC, .name = "hdlc", .open = n_hdlc_tty_open, .close = n_hdlc_tty_close, .read = n_hdlc_tty_read, .write = n_hdlc_tty_write, .ioctl = n_hdlc_tty_ioctl, .poll = n_hdlc_tty_poll, .receive_buf = n_hdlc_tty_receive, .write_wakeup = n_hdlc_tty_wakeup, .flush_buffer = flush_rx_queue, }; static int __init n_hdlc_init(void) { int status; /* range check maxframe arg */ maxframe = clamp(maxframe, 4096, MAX_HDLC_FRAME_SIZE); status = tty_register_ldisc(&n_hdlc_ldisc); if (!status) pr_info("N_HDLC line discipline registered with maxframe=%d\n", maxframe); else pr_err("N_HDLC: error registering line discipline: %d\n", status); return status; } /* end of init_module() */ static void __exit n_hdlc_exit(void) { tty_unregister_ldisc(&n_hdlc_ldisc); } module_init(n_hdlc_init); module_exit(n_hdlc_exit); MODULE_DESCRIPTION("HDLC line discipline support"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul Fulghum paulkf@microgate.com"); module_param(maxframe, int, 0); MODULE_ALIAS_LDISC(N_HDLC);
4864 1524 1521 1525 32 1 1525 1526 1717 147 1851 1851 3040 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_BL_H #define _LINUX_LIST_BL_H #include <linux/list.h> #include <linux/bit_spinlock.h> /* * Special version of lists, where head of the list has a lock in the lowest * bit. This is useful for scalable hash tables without increasing memory * footprint overhead. * * For modification operations, the 0 bit of hlist_bl_head->first * pointer must be set. * * With some small modifications, this can easily be adapted to store several * arbitrary bits (not just a single lock bit), if the need arises to store * some fast and compact auxiliary data. */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif #ifdef CONFIG_DEBUG_LIST #define LIST_BL_BUG_ON(x) BUG_ON(x) #else #define LIST_BL_BUG_ON(x) #endif struct hlist_bl_head { struct hlist_bl_node *first; }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; #define INIT_HLIST_BL_HEAD(ptr) \ ((ptr)->first = NULL) static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { h->next = NULL; h->pprev = NULL; } #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; hlist_bl_set_first(h, n); } static inline void hlist_bl_add_before(struct hlist_bl_node *n, struct hlist_bl_node *next) { struct hlist_bl_node **pprev = next->pprev; n->pprev = pprev; n->next = next; next->pprev = &n->next; /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); } static inline void hlist_bl_add_behind(struct hlist_bl_node *n, struct hlist_bl_node *prev) { n->next = prev->next; n->pprev = &prev->next; prev->next = n; if (n->next) n->next->pprev = &n->next; } static inline void __hlist_bl_del(struct hlist_bl_node *n) { struct hlist_bl_node *next = n->next; struct hlist_bl_node **pprev = n->pprev; LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((unsigned long)next | ((unsigned long)*pprev & LIST_BL_LOCKMASK))); if (next) next->pprev = pprev; } static inline void hlist_bl_del(struct hlist_bl_node *n) { __hlist_bl_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_bl_del_init(struct hlist_bl_node *n) { if (!hlist_bl_unhashed(n)) { __hlist_bl_del(n); INIT_HLIST_BL_NODE(n); } } static inline void hlist_bl_lock(struct hlist_bl_head *b) { bit_spin_lock(0, (unsigned long *)b); } static inline void hlist_bl_unlock(struct hlist_bl_head *b) { __bit_spin_unlock(0, (unsigned long *)b); } static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) { return bit_spin_is_locked(0, (unsigned long *)b); } /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_bl_for_each_entry(tpos, pos, head, member) \ for (pos = hlist_bl_first(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = hlist_bl_first(head); \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) #endif
1 1 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmision.com> */ #include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> #include <linux/random.h> #include <linux/sysctl.h> #include <linux/wait.h> #include <linux/rwsem.h> #ifdef CONFIG_PROC_SYSCTL static void *get_uts(const struct ctl_table *table) { char *which = table->data; struct uts_namespace *uts_ns; uts_ns = current->nsproxy->uts_ns; which = (which - (char *)&init_uts_ns) + (char *)uts_ns; return which; } /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ static int proc_do_uts_string(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; char tmp_data[__NEW_UTS_LEN + 1]; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = tmp_data; /* * Buffer the value in tmp_data so that proc_dostring() can be called * without holding any locks. * We also need to read the original value in the write==1 case to * support partial writes. */ down_read(&uts_sem); memcpy(tmp_data, get_uts(table), sizeof(tmp_data)); up_read(&uts_sem); r = proc_dostring(&uts_table, write, buffer, lenp, ppos); if (write) { /* * Write back the new value. * Note that, since we dropped uts_sem, the result can * theoretically be incorrect if there are two parallel writes * at non-zero offsets to the same sysctl. */ add_device_randomness(tmp_data, sizeof(tmp_data)); down_write(&uts_sem); memcpy(get_uts(table), tmp_data, sizeof(tmp_data)); up_write(&uts_sem); proc_sys_poll_notify(table->poll); } return r; } #else #define proc_do_uts_string NULL #endif static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); // Note: update 'enum uts_proc' to match any changes to this table static const struct ctl_table uts_kern_table[] = { { .procname = "arch", .data = init_uts_ns.name.machine, .maxlen = sizeof(init_uts_ns.name.machine), .mode = 0444, .proc_handler = proc_do_uts_string, }, { .procname = "ostype", .data = init_uts_ns.name.sysname, .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, .proc_handler = proc_do_uts_string, }, { .procname = "osrelease", .data = init_uts_ns.name.release, .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, .proc_handler = proc_do_uts_string, }, { .procname = "version", .data = init_uts_ns.name.version, .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, .proc_handler = proc_do_uts_string, }, { .procname = "hostname", .data = init_uts_ns.name.nodename, .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, .poll = &hostname_poll, }, { .procname = "domainname", .data = init_uts_ns.name.domainname, .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, .poll = &domainname_poll, }, }; #ifdef CONFIG_PROC_SYSCTL /* * Notify userspace about a change in a certain entry of uts_kern_table, * identified by the parameter proc. */ void uts_proc_notify(enum uts_proc proc) { const struct ctl_table *table = &uts_kern_table[proc]; proc_sys_poll_notify(table->poll); } #endif static int __init utsname_sysctl_init(void) { register_sysctl("kernel", uts_kern_table); return 0; } device_initcall(utsname_sysctl_init);
6 6 5 6 6 6 6 2 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 // SPDX-License-Identifier: GPL-2.0-only /* * vivid-ctrls.c - control support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/videodev2.h> #include <media/v4l2-event.h> #include <media/v4l2-common.h> #include "vivid-core.h" #include "vivid-vid-cap.h" #include "vivid-vid-out.h" #include "vivid-vid-common.h" #include "vivid-radio-common.h" #include "vivid-osd.h" #include "vivid-ctrls.h" #define VIVID_CID_CUSTOM_BASE (V4L2_CID_USER_BASE | 0xf000) #define VIVID_CID_BUTTON (VIVID_CID_CUSTOM_BASE + 0) #define VIVID_CID_BOOLEAN (VIVID_CID_CUSTOM_BASE + 1) #define VIVID_CID_INTEGER (VIVID_CID_CUSTOM_BASE + 2) #define VIVID_CID_INTEGER64 (VIVID_CID_CUSTOM_BASE + 3) #define VIVID_CID_MENU (VIVID_CID_CUSTOM_BASE + 4) #define VIVID_CID_STRING (VIVID_CID_CUSTOM_BASE + 5) #define VIVID_CID_BITMASK (VIVID_CID_CUSTOM_BASE + 6) #define VIVID_CID_INTMENU (VIVID_CID_CUSTOM_BASE + 7) #define VIVID_CID_U32_ARRAY (VIVID_CID_CUSTOM_BASE + 8) #define VIVID_CID_U16_MATRIX (VIVID_CID_CUSTOM_BASE + 9) #define VIVID_CID_U8_4D_ARRAY (VIVID_CID_CUSTOM_BASE + 10) #define VIVID_CID_AREA (VIVID_CID_CUSTOM_BASE + 11) #define VIVID_CID_RO_INTEGER (VIVID_CID_CUSTOM_BASE + 12) #define VIVID_CID_U32_DYN_ARRAY (VIVID_CID_CUSTOM_BASE + 13) #define VIVID_CID_U8_PIXEL_ARRAY (VIVID_CID_CUSTOM_BASE + 14) #define VIVID_CID_S32_ARRAY (VIVID_CID_CUSTOM_BASE + 15) #define VIVID_CID_S64_ARRAY (VIVID_CID_CUSTOM_BASE + 16) #define VIVID_CID_VIVID_BASE (0x00f00000 | 0xf000) #define VIVID_CID_VIVID_CLASS (0x00f00000 | 1) #define VIVID_CID_TEST_PATTERN (VIVID_CID_VIVID_BASE + 0) #define VIVID_CID_OSD_TEXT_MODE (VIVID_CID_VIVID_BASE + 1) #define VIVID_CID_HOR_MOVEMENT (VIVID_CID_VIVID_BASE + 2) #define VIVID_CID_VERT_MOVEMENT (VIVID_CID_VIVID_BASE + 3) #define VIVID_CID_SHOW_BORDER (VIVID_CID_VIVID_BASE + 4) #define VIVID_CID_SHOW_SQUARE (VIVID_CID_VIVID_BASE + 5) #define VIVID_CID_INSERT_SAV (VIVID_CID_VIVID_BASE + 6) #define VIVID_CID_INSERT_EAV (VIVID_CID_VIVID_BASE + 7) #define VIVID_CID_VBI_CAP_INTERLACED (VIVID_CID_VIVID_BASE + 8) #define VIVID_CID_INSERT_HDMI_VIDEO_GUARD_BAND (VIVID_CID_VIVID_BASE + 9) #define VIVID_CID_HFLIP (VIVID_CID_VIVID_BASE + 20) #define VIVID_CID_VFLIP (VIVID_CID_VIVID_BASE + 21) #define VIVID_CID_STD_ASPECT_RATIO (VIVID_CID_VIVID_BASE + 22) #define VIVID_CID_DV_TIMINGS_ASPECT_RATIO (VIVID_CID_VIVID_BASE + 23) #define VIVID_CID_TSTAMP_SRC (VIVID_CID_VIVID_BASE + 24) #define VIVID_CID_COLORSPACE (VIVID_CID_VIVID_BASE + 25) #define VIVID_CID_XFER_FUNC (VIVID_CID_VIVID_BASE + 26) #define VIVID_CID_YCBCR_ENC (VIVID_CID_VIVID_BASE + 27) #define VIVID_CID_QUANTIZATION (VIVID_CID_VIVID_BASE + 28) #define VIVID_CID_LIMITED_RGB_RANGE (VIVID_CID_VIVID_BASE + 29) #define VIVID_CID_ALPHA_MODE (VIVID_CID_VIVID_BASE + 30) #define VIVID_CID_HAS_CROP_CAP (VIVID_CID_VIVID_BASE + 31) #define VIVID_CID_HAS_COMPOSE_CAP (VIVID_CID_VIVID_BASE + 32) #define VIVID_CID_HAS_SCALER_CAP (VIVID_CID_VIVID_BASE + 33) #define VIVID_CID_HAS_CROP_OUT (VIVID_CID_VIVID_BASE + 34) #define VIVID_CID_HAS_COMPOSE_OUT (VIVID_CID_VIVID_BASE + 35) #define VIVID_CID_HAS_SCALER_OUT (VIVID_CID_VIVID_BASE + 36) #define VIVID_CID_SEQ_WRAP (VIVID_CID_VIVID_BASE + 38) #define VIVID_CID_TIME_WRAP (VIVID_CID_VIVID_BASE + 39) #define VIVID_CID_MAX_EDID_BLOCKS (VIVID_CID_VIVID_BASE + 40) #define VIVID_CID_PERCENTAGE_FILL (VIVID_CID_VIVID_BASE + 41) #define VIVID_CID_REDUCED_FPS (VIVID_CID_VIVID_BASE + 42) #define VIVID_CID_HSV_ENC (VIVID_CID_VIVID_BASE + 43) #define VIVID_CID_STD_SIGNAL_MODE (VIVID_CID_VIVID_BASE + 60) #define VIVID_CID_STANDARD (VIVID_CID_VIVID_BASE + 61) #define VIVID_CID_DV_TIMINGS_SIGNAL_MODE (VIVID_CID_VIVID_BASE + 62) #define VIVID_CID_DV_TIMINGS (VIVID_CID_VIVID_BASE + 63) #define VIVID_CID_PERC_DROPPED (VIVID_CID_VIVID_BASE + 64) #define VIVID_CID_DISCONNECT (VIVID_CID_VIVID_BASE + 65) #define VIVID_CID_DQBUF_ERROR (VIVID_CID_VIVID_BASE + 66) #define VIVID_CID_QUEUE_SETUP_ERROR (VIVID_CID_VIVID_BASE + 67) #define VIVID_CID_BUF_PREPARE_ERROR (VIVID_CID_VIVID_BASE + 68) #define VIVID_CID_START_STR_ERROR (VIVID_CID_VIVID_BASE + 69) #define VIVID_CID_QUEUE_ERROR (VIVID_CID_VIVID_BASE + 70) #define VIVID_CID_CLEAR_FB (VIVID_CID_VIVID_BASE + 71) #define VIVID_CID_REQ_VALIDATE_ERROR (VIVID_CID_VIVID_BASE + 72) #define VIVID_CID_RADIO_SEEK_MODE (VIVID_CID_VIVID_BASE + 90) #define VIVID_CID_RADIO_SEEK_PROG_LIM (VIVID_CID_VIVID_BASE + 91) #define VIVID_CID_RADIO_RX_RDS_RBDS (VIVID_CID_VIVID_BASE + 92) #define VIVID_CID_RADIO_RX_RDS_BLOCKIO (VIVID_CID_VIVID_BASE + 93) #define VIVID_CID_RADIO_TX_RDS_BLOCKIO (VIVID_CID_VIVID_BASE + 94) #define VIVID_CID_SDR_CAP_FM_DEVIATION (VIVID_CID_VIVID_BASE + 110) #define VIVID_CID_META_CAP_GENERATE_PTS (VIVID_CID_VIVID_BASE + 111) #define VIVID_CID_META_CAP_GENERATE_SCR (VIVID_CID_VIVID_BASE + 112) /* HDMI inputs are in the range 0-14. The next available CID is VIVID_CID_VIVID_BASE + 128 */ #define VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(input) (VIVID_CID_VIVID_BASE + 113 + (input)) /* S-Video inputs are in the range 0-15. The next available CID is VIVID_CID_VIVID_BASE + 144 */ #define VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(input) (VIVID_CID_VIVID_BASE + 128 + (input)) /* General User Controls */ static void vivid_unregister_dev(bool valid, struct video_device *vdev) { if (!valid) return; clear_bit(V4L2_FL_REGISTERED, &vdev->flags); v4l2_event_wake_all(vdev); } static int vivid_user_gen_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_user_gen); switch (ctrl->id) { case VIVID_CID_DISCONNECT: v4l2_info(&dev->v4l2_dev, "disconnect\n"); dev->disconnect_error = true; vivid_unregister_dev(dev->has_vid_cap, &dev->vid_cap_dev); vivid_unregister_dev(dev->has_vid_out, &dev->vid_out_dev); vivid_unregister_dev(dev->has_vbi_cap, &dev->vbi_cap_dev); vivid_unregister_dev(dev->has_vbi_out, &dev->vbi_out_dev); vivid_unregister_dev(dev->has_radio_rx, &dev->radio_rx_dev); vivid_unregister_dev(dev->has_radio_tx, &dev->radio_tx_dev); vivid_unregister_dev(dev->has_sdr_cap, &dev->sdr_cap_dev); vivid_unregister_dev(dev->has_meta_cap, &dev->meta_cap_dev); vivid_unregister_dev(dev->has_meta_out, &dev->meta_out_dev); vivid_unregister_dev(dev->has_touch_cap, &dev->touch_cap_dev); break; case VIVID_CID_BUTTON: dev->button_pressed = 30; break; } return 0; } static const struct v4l2_ctrl_ops vivid_user_gen_ctrl_ops = { .s_ctrl = vivid_user_gen_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_button = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_BUTTON, .name = "Button", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_boolean = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_BOOLEAN, .name = "Boolean", .type = V4L2_CTRL_TYPE_BOOLEAN, .min = 0, .max = 1, .step = 1, .def = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_int32 = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_INTEGER, .name = "Integer 32 Bits", .type = V4L2_CTRL_TYPE_INTEGER, .min = 0xffffffff80000000ULL, .max = 0x7fffffff, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_int64 = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_INTEGER64, .name = "Integer 64 Bits", .type = V4L2_CTRL_TYPE_INTEGER64, .min = 0x8000000000000000ULL, .max = 0x7fffffffffffffffLL, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_u32_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U32_ARRAY, .name = "U32 1 Element Array", .type = V4L2_CTRL_TYPE_U32, .def = 0x18, .min = 0x10, .max = 0x20000, .step = 1, .dims = { 1 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u32_dyn_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U32_DYN_ARRAY, .name = "U32 Dynamic Array", .type = V4L2_CTRL_TYPE_U32, .flags = V4L2_CTRL_FLAG_DYNAMIC_ARRAY, .def = 50, .min = 10, .max = 90, .step = 1, .dims = { 100 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u16_matrix = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U16_MATRIX, .name = "U16 8x16 Matrix", .type = V4L2_CTRL_TYPE_U16, .def = 0x18, .min = 0x10, .max = 0x2000, .step = 1, .dims = { 8, 16 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u8_4d_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U8_4D_ARRAY, .name = "U8 2x3x4x5 Array", .type = V4L2_CTRL_TYPE_U8, .def = 0x18, .min = 0x10, .max = 0x20, .step = 1, .dims = { 2, 3, 4, 5 }, }; static const struct v4l2_ctrl_config vivid_ctrl_u8_pixel_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_U8_PIXEL_ARRAY, .name = "U8 Pixel Array", .type = V4L2_CTRL_TYPE_U8, .def = 0x80, .min = 0x00, .max = 0xff, .step = 1, .dims = { 640 / PIXEL_ARRAY_DIV, 360 / PIXEL_ARRAY_DIV }, }; static const struct v4l2_ctrl_config vivid_ctrl_s32_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_S32_ARRAY, .name = "S32 2 Element Array", .type = V4L2_CTRL_TYPE_INTEGER, .def = 2, .min = -10, .max = 10, .step = 1, .dims = { 2 }, }; static const struct v4l2_ctrl_config vivid_ctrl_s64_array = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_S64_ARRAY, .name = "S64 5 Element Array", .type = V4L2_CTRL_TYPE_INTEGER64, .def = 4, .min = -10, .max = 10, .step = 1, .dims = { 5 }, }; static const char * const vivid_ctrl_menu_strings[] = { "Menu Item 0 (Skipped)", "Menu Item 1", "Menu Item 2 (Skipped)", "Menu Item 3", "Menu Item 4", "Menu Item 5 (Skipped)", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_menu = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_MENU, .name = "Menu", .type = V4L2_CTRL_TYPE_MENU, .min = 1, .max = 4, .def = 3, .menu_skip_mask = 0x04, .qmenu = vivid_ctrl_menu_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_string = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_STRING, .name = "String", .type = V4L2_CTRL_TYPE_STRING, .min = 2, .max = 4, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_bitmask = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_BITMASK, .name = "Bitmask", .type = V4L2_CTRL_TYPE_BITMASK, .def = 0x80002000, .min = 0, .max = 0x80402010, .step = 0, }; static const s64 vivid_ctrl_int_menu_values[] = { 1, 1, 2, 3, 5, 8, 13, 21, 42, }; static const struct v4l2_ctrl_config vivid_ctrl_int_menu = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_INTMENU, .name = "Integer Menu", .type = V4L2_CTRL_TYPE_INTEGER_MENU, .min = 1, .max = 8, .def = 4, .menu_skip_mask = 0x02, .qmenu_int = vivid_ctrl_int_menu_values, }; static const struct v4l2_ctrl_config vivid_ctrl_disconnect = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_DISCONNECT, .name = "Disconnect", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_area area = { .width = 1000, .height = 2000, }; static const struct v4l2_ctrl_config vivid_ctrl_area = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_AREA, .name = "Area", .type = V4L2_CTRL_TYPE_AREA, .p_def.p_const = &area, }; static const struct v4l2_ctrl_config vivid_ctrl_ro_int32 = { .ops = &vivid_user_gen_ctrl_ops, .id = VIVID_CID_RO_INTEGER, .name = "Read-Only Integer 32 Bits", .type = V4L2_CTRL_TYPE_INTEGER, .flags = V4L2_CTRL_FLAG_READ_ONLY, .min = 0, .max = 255, .step = 1, }; /* Framebuffer Controls */ static int vivid_fb_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_fb); switch (ctrl->id) { case VIVID_CID_CLEAR_FB: vivid_clear_fb(dev); break; } return 0; } static const struct v4l2_ctrl_ops vivid_fb_ctrl_ops = { .s_ctrl = vivid_fb_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_clear_fb = { .ops = &vivid_fb_ctrl_ops, .id = VIVID_CID_CLEAR_FB, .name = "Clear Framebuffer", .type = V4L2_CTRL_TYPE_BUTTON, }; /* Video User Controls */ static int vivid_user_vid_g_volatile_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_user_vid); switch (ctrl->id) { case V4L2_CID_AUTOGAIN: dev->gain->val = (jiffies_to_msecs(jiffies) / 1000) & 0xff; break; } return 0; } static int vivid_user_vid_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_user_vid); switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: dev->input_brightness[dev->input] = ctrl->val - dev->input * 128; tpg_s_brightness(&dev->tpg, dev->input_brightness[dev->input]); break; case V4L2_CID_CONTRAST: tpg_s_contrast(&dev->tpg, ctrl->val); break; case V4L2_CID_SATURATION: tpg_s_saturation(&dev->tpg, ctrl->val); break; case V4L2_CID_HUE: tpg_s_hue(&dev->tpg, ctrl->val); break; case V4L2_CID_HFLIP: dev->hflip = ctrl->val; tpg_s_hflip(&dev->tpg, dev->sensor_hflip ^ dev->hflip); break; case V4L2_CID_VFLIP: dev->vflip = ctrl->val; tpg_s_vflip(&dev->tpg, dev->sensor_vflip ^ dev->vflip); break; case V4L2_CID_ALPHA_COMPONENT: tpg_s_alpha_component(&dev->tpg, ctrl->val); break; } return 0; } static const struct v4l2_ctrl_ops vivid_user_vid_ctrl_ops = { .g_volatile_ctrl = vivid_user_vid_g_volatile_ctrl, .s_ctrl = vivid_user_vid_s_ctrl, }; /* Video Capture Controls */ static void vivid_update_power_present(struct vivid_dev *dev) { unsigned int i, j; dev->power_present = 0; for (i = 0, j = 0; i < ARRAY_SIZE(dev->dv_timings_signal_mode); i++) { if (dev->input_type[i] != HDMI) continue; /* * If connected to TPG or HDMI output, and the signal * mode is not NO_SIGNAL, then there is power present. */ if (dev->input_is_connected_to_output[i] != 1 && dev->dv_timings_signal_mode[i] != NO_SIGNAL) dev->power_present |= (1 << j); j++; } __v4l2_ctrl_s_ctrl(dev->ctrl_rx_power_present, dev->power_present); v4l2_ctrl_activate(dev->ctrl_dv_timings, dev->dv_timings_signal_mode[dev->input] == SELECTED_DV_TIMINGS); } static int vivid_vid_cap_s_ctrl(struct v4l2_ctrl *ctrl) { static const u32 colorspaces[] = { V4L2_COLORSPACE_SMPTE170M, V4L2_COLORSPACE_REC709, V4L2_COLORSPACE_SRGB, V4L2_COLORSPACE_OPRGB, V4L2_COLORSPACE_BT2020, V4L2_COLORSPACE_DCI_P3, V4L2_COLORSPACE_SMPTE240M, V4L2_COLORSPACE_470_SYSTEM_M, V4L2_COLORSPACE_470_SYSTEM_BG, }; struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_vid_cap); unsigned int i; struct vivid_dev *output_inst = NULL; int index = 0; int hdmi_index, svid_index; s32 input_index = 0; switch (ctrl->id) { case VIVID_CID_TEST_PATTERN: vivid_update_quality(dev); tpg_s_pattern(&dev->tpg, ctrl->val); break; case VIVID_CID_COLORSPACE: tpg_s_colorspace(&dev->tpg, colorspaces[ctrl->val]); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_XFER_FUNC: tpg_s_xfer_func(&dev->tpg, ctrl->val); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_YCBCR_ENC: tpg_s_ycbcr_enc(&dev->tpg, ctrl->val); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_HSV_ENC: tpg_s_hsv_enc(&dev->tpg, ctrl->val ? V4L2_HSV_ENC_256 : V4L2_HSV_ENC_180); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case VIVID_CID_QUANTIZATION: tpg_s_quantization(&dev->tpg, ctrl->val); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); vivid_send_source_change(dev, HDMI); vivid_send_source_change(dev, WEBCAM); break; case V4L2_CID_DV_RX_RGB_RANGE: if (!vivid_is_hdmi_cap(dev)) break; tpg_s_rgb_range(&dev->tpg, ctrl->val); break; case VIVID_CID_LIMITED_RGB_RANGE: tpg_s_real_rgb_range(&dev->tpg, ctrl->val ? V4L2_DV_RGB_RANGE_LIMITED : V4L2_DV_RGB_RANGE_FULL); break; case VIVID_CID_ALPHA_MODE: tpg_s_alpha_mode(&dev->tpg, ctrl->val); break; case VIVID_CID_HOR_MOVEMENT: tpg_s_mv_hor_mode(&dev->tpg, ctrl->val); break; case VIVID_CID_VERT_MOVEMENT: tpg_s_mv_vert_mode(&dev->tpg, ctrl->val); break; case VIVID_CID_OSD_TEXT_MODE: dev->osd_mode = ctrl->val; break; case VIVID_CID_PERCENTAGE_FILL: tpg_s_perc_fill(&dev->tpg, ctrl->val); for (i = 0; i < MAX_VID_CAP_BUFFERS; i++) dev->must_blank[i] = ctrl->val < 100; break; case VIVID_CID_INSERT_SAV: tpg_s_insert_sav(&dev->tpg, ctrl->val); break; case VIVID_CID_INSERT_EAV: tpg_s_insert_eav(&dev->tpg, ctrl->val); break; case VIVID_CID_INSERT_HDMI_VIDEO_GUARD_BAND: tpg_s_insert_hdmi_video_guard_band(&dev->tpg, ctrl->val); break; case VIVID_CID_HFLIP: dev->sensor_hflip = ctrl->val; tpg_s_hflip(&dev->tpg, dev->sensor_hflip ^ dev->hflip); break; case VIVID_CID_VFLIP: dev->sensor_vflip = ctrl->val; tpg_s_vflip(&dev->tpg, dev->sensor_vflip ^ dev->vflip); break; case VIVID_CID_REDUCED_FPS: dev->reduced_fps = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_HAS_CROP_CAP: dev->has_crop_cap = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_HAS_COMPOSE_CAP: dev->has_compose_cap = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_HAS_SCALER_CAP: dev->has_scaler_cap = ctrl->val; vivid_update_format_cap(dev, true); break; case VIVID_CID_SHOW_BORDER: tpg_s_show_border(&dev->tpg, ctrl->val); break; case VIVID_CID_SHOW_SQUARE: tpg_s_show_square(&dev->tpg, ctrl->val); break; case VIVID_CID_STD_ASPECT_RATIO: dev->std_aspect_ratio[dev->input] = ctrl->val; tpg_s_video_aspect(&dev->tpg, vivid_get_video_aspect(dev)); break; case VIVID_CID_DV_TIMINGS_SIGNAL_MODE: dev->dv_timings_signal_mode[dev->input] = dev->ctrl_dv_timings_signal_mode->val; dev->query_dv_timings[dev->input] = dev->ctrl_dv_timings->val; vivid_update_power_present(dev); vivid_update_quality(dev); vivid_send_input_source_change(dev, dev->input); break; case VIVID_CID_DV_TIMINGS_ASPECT_RATIO: dev->dv_timings_aspect_ratio[dev->input] = ctrl->val; tpg_s_video_aspect(&dev->tpg, vivid_get_video_aspect(dev)); break; case VIVID_CID_TSTAMP_SRC: dev->tstamp_src_is_soe = ctrl->val; dev->vb_vid_cap_q.timestamp_flags &= ~V4L2_BUF_FLAG_TSTAMP_SRC_MASK; if (dev->tstamp_src_is_soe) dev->vb_vid_cap_q.timestamp_flags |= V4L2_BUF_FLAG_TSTAMP_SRC_SOE; break; case VIVID_CID_MAX_EDID_BLOCKS: dev->edid_max_blocks = ctrl->val; if (dev->edid_blocks > dev->edid_max_blocks) dev->edid_blocks = dev->edid_max_blocks; break; case VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(0) ... VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(14): hdmi_index = ctrl->id - VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(0); output_inst = vivid_ctrl_hdmi_to_output_instance[ctrl->cur.val]; index = vivid_ctrl_hdmi_to_output_index[ctrl->cur.val]; input_index = dev->hdmi_index_to_input_index[hdmi_index]; dev->input_is_connected_to_output[input_index] = ctrl->val; if (output_inst) { output_inst->output_to_input_instance[index] = NULL; vivid_update_outputs(output_inst); cec_phys_addr_invalidate(output_inst->cec_tx_adap[index]); } if (ctrl->val >= FIXED_MENU_ITEMS) { output_inst = vivid_ctrl_hdmi_to_output_instance[ctrl->val]; index = vivid_ctrl_hdmi_to_output_index[ctrl->val]; output_inst->output_to_input_instance[index] = dev; output_inst->output_to_input_index[index] = dev->hdmi_index_to_input_index[hdmi_index]; } spin_lock(&hdmi_output_skip_mask_lock); hdmi_to_output_menu_skip_mask &= ~(1ULL << ctrl->cur.val); if (ctrl->val >= FIXED_MENU_ITEMS) hdmi_to_output_menu_skip_mask |= 1ULL << ctrl->val; spin_unlock(&hdmi_output_skip_mask_lock); vivid_update_power_present(dev); vivid_update_quality(dev); vivid_send_input_source_change(dev, dev->hdmi_index_to_input_index[hdmi_index]); if (ctrl->val < FIXED_MENU_ITEMS && ctrl->cur.val < FIXED_MENU_ITEMS) break; spin_lock(&hdmi_output_skip_mask_lock); hdmi_input_update_outputs_mask |= 1 << dev->inst; spin_unlock(&hdmi_output_skip_mask_lock); queue_work(update_hdmi_ctrls_workqueue, &dev->update_hdmi_ctrl_work); break; case VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(0) ... VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(15): svid_index = ctrl->id - VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(0); output_inst = vivid_ctrl_svid_to_output_instance[ctrl->cur.val]; index = vivid_ctrl_svid_to_output_index[ctrl->cur.val]; input_index = dev->svid_index_to_input_index[svid_index]; dev->input_is_connected_to_output[input_index] = ctrl->val; if (output_inst) output_inst->output_to_input_instance[index] = NULL; if (ctrl->val >= FIXED_MENU_ITEMS) { output_inst = vivid_ctrl_svid_to_output_instance[ctrl->val]; index = vivid_ctrl_svid_to_output_index[ctrl->val]; output_inst->output_to_input_instance[index] = dev; output_inst->output_to_input_index[index] = dev->svid_index_to_input_index[svid_index]; } spin_lock(&svid_output_skip_mask_lock); svid_to_output_menu_skip_mask &= ~(1ULL << ctrl->cur.val); if (ctrl->val >= FIXED_MENU_ITEMS) svid_to_output_menu_skip_mask |= 1ULL << ctrl->val; spin_unlock(&svid_output_skip_mask_lock); vivid_update_quality(dev); vivid_send_input_source_change(dev, dev->svid_index_to_input_index[svid_index]); if (ctrl->val < FIXED_MENU_ITEMS && ctrl->cur.val < FIXED_MENU_ITEMS) break; queue_work(update_svid_ctrls_workqueue, &dev->update_svid_ctrl_work); break; } return 0; } static const struct v4l2_ctrl_ops vivid_vid_cap_ctrl_ops = { .s_ctrl = vivid_vid_cap_s_ctrl, }; static const char * const vivid_ctrl_hor_movement_strings[] = { "Move Left Fast", "Move Left", "Move Left Slow", "No Movement", "Move Right Slow", "Move Right", "Move Right Fast", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_hor_movement = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HOR_MOVEMENT, .name = "Horizontal Movement", .type = V4L2_CTRL_TYPE_MENU, .max = TPG_MOVE_POS_FAST, .def = TPG_MOVE_NONE, .qmenu = vivid_ctrl_hor_movement_strings, }; static const char * const vivid_ctrl_vert_movement_strings[] = { "Move Up Fast", "Move Up", "Move Up Slow", "No Movement", "Move Down Slow", "Move Down", "Move Down Fast", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_vert_movement = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_VERT_MOVEMENT, .name = "Vertical Movement", .type = V4L2_CTRL_TYPE_MENU, .max = TPG_MOVE_POS_FAST, .def = TPG_MOVE_NONE, .qmenu = vivid_ctrl_vert_movement_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_show_border = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_SHOW_BORDER, .name = "Show Border", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_show_square = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_SHOW_SQUARE, .name = "Show Square", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const char * const vivid_ctrl_osd_mode_strings[] = { "All", "Counters Only", "None", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_osd_mode = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_OSD_TEXT_MODE, .name = "OSD Text Mode", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_osd_mode_strings) - 2, .qmenu = vivid_ctrl_osd_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_perc_fill = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_PERCENTAGE_FILL, .name = "Fill Percentage of Frame", .type = V4L2_CTRL_TYPE_INTEGER, .min = 0, .max = 100, .def = 100, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_insert_sav = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_INSERT_SAV, .name = "Insert SAV Code in Image", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_insert_eav = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_INSERT_EAV, .name = "Insert EAV Code in Image", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_insert_hdmi_video_guard_band = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_INSERT_HDMI_VIDEO_GUARD_BAND, .name = "Insert Video Guard Band", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_hflip = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HFLIP, .name = "Sensor Flipped Horizontally", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_vflip = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_VFLIP, .name = "Sensor Flipped Vertically", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_reduced_fps = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_REDUCED_FPS, .name = "Reduced Framerate", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_crop_cap = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HAS_CROP_CAP, .name = "Enable Capture Cropping", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_compose_cap = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HAS_COMPOSE_CAP, .name = "Enable Capture Composing", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_scaler_cap = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HAS_SCALER_CAP, .name = "Enable Capture Scaler", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const char * const vivid_ctrl_tstamp_src_strings[] = { "End of Frame", "Start of Exposure", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_tstamp_src = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_TSTAMP_SRC, .name = "Timestamp Source", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_tstamp_src_strings) - 2, .qmenu = vivid_ctrl_tstamp_src_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_std_aspect_ratio = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_STD_ASPECT_RATIO, .name = "Standard Aspect Ratio", .type = V4L2_CTRL_TYPE_MENU, .min = 1, .max = 4, .def = 1, .qmenu = tpg_aspect_strings, }; static const char * const vivid_ctrl_dv_timings_signal_mode_strings[] = { "Current DV Timings", "No Signal", "No Lock", "Out of Range", "Selected DV Timings", "Cycle Through All DV Timings", "Custom DV Timings", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_dv_timings_signal_mode = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_DV_TIMINGS_SIGNAL_MODE, .name = "DV Timings Signal Mode", .type = V4L2_CTRL_TYPE_MENU, .max = 5, .qmenu = vivid_ctrl_dv_timings_signal_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_dv_timings_aspect_ratio = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_DV_TIMINGS_ASPECT_RATIO, .name = "DV Timings Aspect Ratio", .type = V4L2_CTRL_TYPE_MENU, .max = 3, .qmenu = tpg_aspect_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_max_edid_blocks = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_MAX_EDID_BLOCKS, .name = "Maximum EDID Blocks", .type = V4L2_CTRL_TYPE_INTEGER, .min = 1, .max = 256, .def = 2, .step = 1, }; static const char * const vivid_ctrl_colorspace_strings[] = { "SMPTE 170M", "Rec. 709", "sRGB", "opRGB", "BT.2020", "DCI-P3", "SMPTE 240M", "470 System M", "470 System BG", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_colorspace = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_COLORSPACE, .name = "Colorspace", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_colorspace_strings) - 2, .def = 2, .qmenu = vivid_ctrl_colorspace_strings, }; static const char * const vivid_ctrl_xfer_func_strings[] = { "Default", "Rec. 709", "sRGB", "opRGB", "SMPTE 240M", "None", "DCI-P3", "SMPTE 2084", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_xfer_func = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_XFER_FUNC, .name = "Transfer Function", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_xfer_func_strings) - 2, .qmenu = vivid_ctrl_xfer_func_strings, }; static const char * const vivid_ctrl_ycbcr_enc_strings[] = { "Default", "ITU-R 601", "Rec. 709", "xvYCC 601", "xvYCC 709", "", "BT.2020", "BT.2020 Constant Luminance", "SMPTE 240M", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_ycbcr_enc = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_YCBCR_ENC, .name = "Y'CbCr Encoding", .type = V4L2_CTRL_TYPE_MENU, .menu_skip_mask = 1 << 5, .max = ARRAY_SIZE(vivid_ctrl_ycbcr_enc_strings) - 2, .qmenu = vivid_ctrl_ycbcr_enc_strings, }; static const char * const vivid_ctrl_hsv_enc_strings[] = { "Hue 0-179", "Hue 0-256", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_hsv_enc = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HSV_ENC, .name = "HSV Encoding", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_hsv_enc_strings) - 2, .qmenu = vivid_ctrl_hsv_enc_strings, }; static const char * const vivid_ctrl_quantization_strings[] = { "Default", "Full Range", "Limited Range", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_quantization = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_QUANTIZATION, .name = "Quantization", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_quantization_strings) - 2, .qmenu = vivid_ctrl_quantization_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_alpha_mode = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_ALPHA_MODE, .name = "Apply Alpha To Red Only", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_limited_rgb_range = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_LIMITED_RGB_RANGE, .name = "Limited RGB Range (16-235)", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; /* VBI Capture Control */ static int vivid_vbi_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_vbi_cap); switch (ctrl->id) { case VIVID_CID_VBI_CAP_INTERLACED: dev->vbi_cap_interlaced = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_vbi_cap_ctrl_ops = { .s_ctrl = vivid_vbi_cap_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_vbi_cap_interlaced = { .ops = &vivid_vbi_cap_ctrl_ops, .id = VIVID_CID_VBI_CAP_INTERLACED, .name = "Interlaced VBI Format", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; /* Video Output Controls */ static int vivid_vid_out_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_vid_out); struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; switch (ctrl->id) { case VIVID_CID_HAS_CROP_OUT: dev->has_crop_out = ctrl->val; vivid_update_format_out(dev); break; case VIVID_CID_HAS_COMPOSE_OUT: dev->has_compose_out = ctrl->val; vivid_update_format_out(dev); break; case VIVID_CID_HAS_SCALER_OUT: dev->has_scaler_out = ctrl->val; vivid_update_format_out(dev); break; case V4L2_CID_DV_TX_MODE: dev->dvi_d_out = ctrl->val == V4L2_DV_TX_MODE_DVI_D; if (!vivid_is_hdmi_out(dev)) break; if (!dev->dvi_d_out && (bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { if (bt->width == 720 && bt->height <= 576) dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; else dev->colorspace_out = V4L2_COLORSPACE_REC709; dev->quantization_out = V4L2_QUANTIZATION_DEFAULT; } else { dev->colorspace_out = V4L2_COLORSPACE_SRGB; dev->quantization_out = dev->dvi_d_out ? V4L2_QUANTIZATION_LIM_RANGE : V4L2_QUANTIZATION_DEFAULT; } if (vivid_output_is_connected_to(dev)) { struct vivid_dev *dev_rx = vivid_output_is_connected_to(dev); vivid_send_source_change(dev_rx, HDMI); } break; } return 0; } static const struct v4l2_ctrl_ops vivid_vid_out_ctrl_ops = { .s_ctrl = vivid_vid_out_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_has_crop_out = { .ops = &vivid_vid_out_ctrl_ops, .id = VIVID_CID_HAS_CROP_OUT, .name = "Enable Output Cropping", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_compose_out = { .ops = &vivid_vid_out_ctrl_ops, .id = VIVID_CID_HAS_COMPOSE_OUT, .name = "Enable Output Composing", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_has_scaler_out = { .ops = &vivid_vid_out_ctrl_ops, .id = VIVID_CID_HAS_SCALER_OUT, .name = "Enable Output Scaler", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; /* Streaming Controls */ static int vivid_streaming_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_streaming); switch (ctrl->id) { case VIVID_CID_DQBUF_ERROR: dev->dqbuf_error = true; break; case VIVID_CID_PERC_DROPPED: dev->perc_dropped_buffers = ctrl->val; break; case VIVID_CID_QUEUE_SETUP_ERROR: dev->queue_setup_error = true; break; case VIVID_CID_BUF_PREPARE_ERROR: dev->buf_prepare_error = true; break; case VIVID_CID_START_STR_ERROR: dev->start_streaming_error = true; break; case VIVID_CID_REQ_VALIDATE_ERROR: dev->req_validate_error = true; break; case VIVID_CID_QUEUE_ERROR: if (vb2_start_streaming_called(&dev->vb_vid_cap_q)) vb2_queue_error(&dev->vb_vid_cap_q); if (vb2_start_streaming_called(&dev->vb_vbi_cap_q)) vb2_queue_error(&dev->vb_vbi_cap_q); if (vb2_start_streaming_called(&dev->vb_vid_out_q)) vb2_queue_error(&dev->vb_vid_out_q); if (vb2_start_streaming_called(&dev->vb_vbi_out_q)) vb2_queue_error(&dev->vb_vbi_out_q); if (vb2_start_streaming_called(&dev->vb_sdr_cap_q)) vb2_queue_error(&dev->vb_sdr_cap_q); break; case VIVID_CID_SEQ_WRAP: dev->seq_wrap = ctrl->val; break; case VIVID_CID_TIME_WRAP: dev->time_wrap = ctrl->val; if (dev->time_wrap == 1) dev->time_wrap = (1ULL << 63) - NSEC_PER_SEC * 16ULL; else if (dev->time_wrap == 2) dev->time_wrap = ((1ULL << 31) - 16) * NSEC_PER_SEC; break; } return 0; } static const struct v4l2_ctrl_ops vivid_streaming_ctrl_ops = { .s_ctrl = vivid_streaming_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_dqbuf_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_DQBUF_ERROR, .name = "Inject V4L2_BUF_FLAG_ERROR", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_perc_dropped = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_PERC_DROPPED, .name = "Percentage of Dropped Buffers", .type = V4L2_CTRL_TYPE_INTEGER, .min = 0, .max = 100, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_queue_setup_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_QUEUE_SETUP_ERROR, .name = "Inject VIDIOC_REQBUFS Error", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_buf_prepare_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_BUF_PREPARE_ERROR, .name = "Inject VIDIOC_QBUF Error", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_start_streaming_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_START_STR_ERROR, .name = "Inject VIDIOC_STREAMON Error", .type = V4L2_CTRL_TYPE_BUTTON, }; static const struct v4l2_ctrl_config vivid_ctrl_queue_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_QUEUE_ERROR, .name = "Inject Fatal Streaming Error", .type = V4L2_CTRL_TYPE_BUTTON, }; #ifdef CONFIG_MEDIA_CONTROLLER static const struct v4l2_ctrl_config vivid_ctrl_req_validate_error = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_REQ_VALIDATE_ERROR, .name = "Inject req_validate() Error", .type = V4L2_CTRL_TYPE_BUTTON, }; #endif static const struct v4l2_ctrl_config vivid_ctrl_seq_wrap = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_SEQ_WRAP, .name = "Wrap Sequence Number", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const char * const vivid_ctrl_time_wrap_strings[] = { "None", "64 Bit", "32 Bit", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_time_wrap = { .ops = &vivid_streaming_ctrl_ops, .id = VIVID_CID_TIME_WRAP, .name = "Wrap Timestamp", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_time_wrap_strings) - 2, .qmenu = vivid_ctrl_time_wrap_strings, }; /* SDTV Capture Controls */ static int vivid_sdtv_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_sdtv_cap); switch (ctrl->id) { case VIVID_CID_STD_SIGNAL_MODE: dev->std_signal_mode[dev->input] = dev->ctrl_std_signal_mode->val; if (dev->std_signal_mode[dev->input] == SELECTED_STD) dev->query_std[dev->input] = vivid_standard[dev->ctrl_standard->val]; v4l2_ctrl_activate(dev->ctrl_standard, dev->std_signal_mode[dev->input] == SELECTED_STD); vivid_update_quality(dev); vivid_send_source_change(dev, TV); vivid_send_source_change(dev, SVID); break; } return 0; } static const struct v4l2_ctrl_ops vivid_sdtv_cap_ctrl_ops = { .s_ctrl = vivid_sdtv_cap_s_ctrl, }; static const char * const vivid_ctrl_std_signal_mode_strings[] = { "Current Standard", "No Signal", "No Lock", "", "Selected Standard", "Cycle Through All Standards", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_std_signal_mode = { .ops = &vivid_sdtv_cap_ctrl_ops, .id = VIVID_CID_STD_SIGNAL_MODE, .name = "Standard Signal Mode", .type = V4L2_CTRL_TYPE_MENU, .max = ARRAY_SIZE(vivid_ctrl_std_signal_mode_strings) - 2, .menu_skip_mask = 1 << 3, .qmenu = vivid_ctrl_std_signal_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_standard = { .ops = &vivid_sdtv_cap_ctrl_ops, .id = VIVID_CID_STANDARD, .name = "Standard", .type = V4L2_CTRL_TYPE_MENU, .max = 14, .qmenu = vivid_ctrl_standard_strings, }; /* Radio Receiver Controls */ static int vivid_radio_rx_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_radio_rx); switch (ctrl->id) { case VIVID_CID_RADIO_SEEK_MODE: dev->radio_rx_hw_seek_mode = ctrl->val; break; case VIVID_CID_RADIO_SEEK_PROG_LIM: dev->radio_rx_hw_seek_prog_lim = ctrl->val; break; case VIVID_CID_RADIO_RX_RDS_RBDS: dev->rds_gen.use_rbds = ctrl->val; break; case VIVID_CID_RADIO_RX_RDS_BLOCKIO: dev->radio_rx_rds_controls = ctrl->val; dev->radio_rx_caps &= ~V4L2_CAP_READWRITE; dev->radio_rx_rds_use_alternates = false; if (!dev->radio_rx_rds_controls) { dev->radio_rx_caps |= V4L2_CAP_READWRITE; __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_pty, 0); __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ta, 0); __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_tp, 0); __v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ms, 0); __v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_psname, ""); __v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_radiotext, ""); } v4l2_ctrl_activate(dev->radio_rx_rds_pty, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_psname, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_radiotext, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_ta, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_tp, dev->radio_rx_rds_controls); v4l2_ctrl_activate(dev->radio_rx_rds_ms, dev->radio_rx_rds_controls); dev->radio_rx_dev.device_caps = dev->radio_rx_caps; break; case V4L2_CID_RDS_RECEPTION: dev->radio_rx_rds_enabled = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_radio_rx_ctrl_ops = { .s_ctrl = vivid_radio_rx_s_ctrl, }; static const char * const vivid_ctrl_radio_rds_mode_strings[] = { "Block I/O", "Controls", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_rx_rds_blockio = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_RX_RDS_BLOCKIO, .name = "RDS Rx I/O Mode", .type = V4L2_CTRL_TYPE_MENU, .qmenu = vivid_ctrl_radio_rds_mode_strings, .max = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_rx_rds_rbds = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_RX_RDS_RBDS, .name = "Generate RBDS Instead of RDS", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; static const char * const vivid_ctrl_radio_hw_seek_mode_strings[] = { "Bounded", "Wrap Around", "Both", NULL, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_hw_seek_mode = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_SEEK_MODE, .name = "Radio HW Seek Mode", .type = V4L2_CTRL_TYPE_MENU, .max = 2, .qmenu = vivid_ctrl_radio_hw_seek_mode_strings, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_hw_seek_prog_lim = { .ops = &vivid_radio_rx_ctrl_ops, .id = VIVID_CID_RADIO_SEEK_PROG_LIM, .name = "Radio Programmable HW Seek", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .step = 1, }; /* Radio Transmitter Controls */ static int vivid_radio_tx_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_radio_tx); switch (ctrl->id) { case VIVID_CID_RADIO_TX_RDS_BLOCKIO: dev->radio_tx_rds_controls = ctrl->val; dev->radio_tx_caps &= ~V4L2_CAP_READWRITE; if (!dev->radio_tx_rds_controls) dev->radio_tx_caps |= V4L2_CAP_READWRITE; dev->radio_tx_dev.device_caps = dev->radio_tx_caps; break; case V4L2_CID_RDS_TX_PTY: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_pty, ctrl->val); break; case V4L2_CID_RDS_TX_PS_NAME: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_psname, ctrl->p_new.p_char); break; case V4L2_CID_RDS_TX_RADIO_TEXT: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl_string(dev->radio_rx_rds_radiotext, ctrl->p_new.p_char); break; case V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ta, ctrl->val); break; case V4L2_CID_RDS_TX_TRAFFIC_PROGRAM: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_tp, ctrl->val); break; case V4L2_CID_RDS_TX_MUSIC_SPEECH: if (dev->radio_rx_rds_controls) v4l2_ctrl_s_ctrl(dev->radio_rx_rds_ms, ctrl->val); break; } return 0; } static const struct v4l2_ctrl_ops vivid_radio_tx_ctrl_ops = { .s_ctrl = vivid_radio_tx_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_radio_tx_rds_blockio = { .ops = &vivid_radio_tx_ctrl_ops, .id = VIVID_CID_RADIO_TX_RDS_BLOCKIO, .name = "RDS Tx I/O Mode", .type = V4L2_CTRL_TYPE_MENU, .qmenu = vivid_ctrl_radio_rds_mode_strings, .max = 1, .def = 1, }; /* SDR Capture Controls */ static int vivid_sdr_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_sdr_cap); switch (ctrl->id) { case VIVID_CID_SDR_CAP_FM_DEVIATION: dev->sdr_fm_deviation = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_sdr_cap_ctrl_ops = { .s_ctrl = vivid_sdr_cap_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_sdr_cap_fm_deviation = { .ops = &vivid_sdr_cap_ctrl_ops, .id = VIVID_CID_SDR_CAP_FM_DEVIATION, .name = "FM Deviation", .type = V4L2_CTRL_TYPE_INTEGER, .min = 100, .max = 200000, .def = 75000, .step = 1, }; /* Metadata Capture Control */ static int vivid_meta_cap_s_ctrl(struct v4l2_ctrl *ctrl) { struct vivid_dev *dev = container_of(ctrl->handler, struct vivid_dev, ctrl_hdl_meta_cap); switch (ctrl->id) { case VIVID_CID_META_CAP_GENERATE_PTS: dev->meta_pts = ctrl->val; break; case VIVID_CID_META_CAP_GENERATE_SCR: dev->meta_scr = ctrl->val; break; } return 0; } static const struct v4l2_ctrl_ops vivid_meta_cap_ctrl_ops = { .s_ctrl = vivid_meta_cap_s_ctrl, }; static const struct v4l2_ctrl_config vivid_ctrl_meta_has_pts = { .ops = &vivid_meta_cap_ctrl_ops, .id = VIVID_CID_META_CAP_GENERATE_PTS, .name = "Generate PTS", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_meta_has_src_clk = { .ops = &vivid_meta_cap_ctrl_ops, .id = VIVID_CID_META_CAP_GENERATE_SCR, .name = "Generate SCR", .type = V4L2_CTRL_TYPE_BOOLEAN, .max = 1, .def = 1, .step = 1, }; static const struct v4l2_ctrl_config vivid_ctrl_class = { .ops = &vivid_user_gen_ctrl_ops, .flags = V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY, .id = VIVID_CID_VIVID_CLASS, .name = "Vivid Controls", .type = V4L2_CTRL_TYPE_CTRL_CLASS, }; int vivid_create_controls(struct vivid_dev *dev, bool show_ccs_cap, bool show_ccs_out, bool no_error_inj, bool has_sdtv, bool has_hdmi) { struct v4l2_ctrl_handler *hdl_user_gen = &dev->ctrl_hdl_user_gen; struct v4l2_ctrl_handler *hdl_user_vid = &dev->ctrl_hdl_user_vid; struct v4l2_ctrl_handler *hdl_user_aud = &dev->ctrl_hdl_user_aud; struct v4l2_ctrl_handler *hdl_streaming = &dev->ctrl_hdl_streaming; struct v4l2_ctrl_handler *hdl_sdtv_cap = &dev->ctrl_hdl_sdtv_cap; struct v4l2_ctrl_handler *hdl_loop_cap = &dev->ctrl_hdl_loop_cap; struct v4l2_ctrl_handler *hdl_fb = &dev->ctrl_hdl_fb; struct v4l2_ctrl_handler *hdl_vid_cap = &dev->ctrl_hdl_vid_cap; struct v4l2_ctrl_handler *hdl_vid_out = &dev->ctrl_hdl_vid_out; struct v4l2_ctrl_handler *hdl_vbi_cap = &dev->ctrl_hdl_vbi_cap; struct v4l2_ctrl_handler *hdl_vbi_out = &dev->ctrl_hdl_vbi_out; struct v4l2_ctrl_handler *hdl_radio_rx = &dev->ctrl_hdl_radio_rx; struct v4l2_ctrl_handler *hdl_radio_tx = &dev->ctrl_hdl_radio_tx; struct v4l2_ctrl_handler *hdl_sdr_cap = &dev->ctrl_hdl_sdr_cap; struct v4l2_ctrl_handler *hdl_meta_cap = &dev->ctrl_hdl_meta_cap; struct v4l2_ctrl_handler *hdl_meta_out = &dev->ctrl_hdl_meta_out; struct v4l2_ctrl_handler *hdl_tch_cap = &dev->ctrl_hdl_touch_cap; struct v4l2_ctrl_config vivid_ctrl_dv_timings = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_DV_TIMINGS, .name = "DV Timings", .type = V4L2_CTRL_TYPE_MENU, }; int i; v4l2_ctrl_handler_init(hdl_user_gen, 10); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_user_vid, 9); v4l2_ctrl_new_custom(hdl_user_vid, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_user_aud, 2); v4l2_ctrl_new_custom(hdl_user_aud, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_streaming, 8); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_sdtv_cap, 2); v4l2_ctrl_new_custom(hdl_sdtv_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_loop_cap, 1); v4l2_ctrl_new_custom(hdl_loop_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_fb, 1); v4l2_ctrl_new_custom(hdl_fb, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vid_cap, 55); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vid_out, 26); if (!no_error_inj || dev->has_fb || dev->num_hdmi_outputs) v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vbi_cap, 21); v4l2_ctrl_new_custom(hdl_vbi_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_vbi_out, 19); if (!no_error_inj) v4l2_ctrl_new_custom(hdl_vbi_out, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_radio_rx, 17); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_radio_tx, 17); v4l2_ctrl_new_custom(hdl_radio_tx, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_sdr_cap, 19); v4l2_ctrl_new_custom(hdl_sdr_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_meta_cap, 2); v4l2_ctrl_new_custom(hdl_meta_cap, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_meta_out, 2); v4l2_ctrl_new_custom(hdl_meta_out, &vivid_ctrl_class, NULL); v4l2_ctrl_handler_init(hdl_tch_cap, 2); v4l2_ctrl_new_custom(hdl_tch_cap, &vivid_ctrl_class, NULL); /* User Controls */ dev->volume = v4l2_ctrl_new_std(hdl_user_aud, NULL, V4L2_CID_AUDIO_VOLUME, 0, 255, 1, 200); dev->mute = v4l2_ctrl_new_std(hdl_user_aud, NULL, V4L2_CID_AUDIO_MUTE, 0, 1, 1, 0); if (dev->has_vid_cap) { dev->brightness = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_BRIGHTNESS, 0, 255, 1, 128); for (i = 0; i < MAX_INPUTS; i++) dev->input_brightness[i] = 128; dev->contrast = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_CONTRAST, 0, 255, 1, 128); dev->saturation = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_SATURATION, 0, 255, 1, 128); dev->hue = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_HUE, -128, 128, 1, 0); v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_HFLIP, 0, 1, 1, 0); v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_VFLIP, 0, 1, 1, 0); dev->autogain = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_AUTOGAIN, 0, 1, 1, 1); dev->gain = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_GAIN, 0, 255, 1, 100); dev->alpha = v4l2_ctrl_new_std(hdl_user_vid, &vivid_user_vid_ctrl_ops, V4L2_CID_ALPHA_COMPONENT, 0, 255, 1, 0); } dev->button = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_button, NULL); dev->int32 = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_int32, NULL); dev->int64 = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_int64, NULL); dev->boolean = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_boolean, NULL); dev->menu = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_menu, NULL); dev->string = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_string, NULL); dev->bitmask = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_bitmask, NULL); dev->int_menu = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_int_menu, NULL); dev->ro_int32 = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_ro_int32, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_area, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u32_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u32_dyn_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u16_matrix, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u8_4d_array, NULL); dev->pixel_array = v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_u8_pixel_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_s32_array, NULL); v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_s64_array, NULL); if (dev->has_vid_cap) { /* Image Processing Controls */ struct v4l2_ctrl_config vivid_ctrl_test_pattern = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_TEST_PATTERN, .name = "Test Pattern", .type = V4L2_CTRL_TYPE_MENU, .max = TPG_PAT_NOISE, .qmenu = tpg_pattern_strings, }; dev->test_pattern = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_test_pattern, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_perc_fill, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_hor_movement, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_vert_movement, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_osd_mode, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_show_border, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_show_square, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_hflip, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_vflip, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_insert_sav, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_insert_eav, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_insert_hdmi_video_guard_band, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_reduced_fps, NULL); WARN_ON(dev->num_hdmi_inputs > MAX_HDMI_INPUTS); WARN_ON(dev->num_svid_inputs > MAX_SVID_INPUTS); for (u8 i = 0; i < dev->num_hdmi_inputs; i++) { snprintf(dev->ctrl_hdmi_to_output_names[i], sizeof(dev->ctrl_hdmi_to_output_names[i]), "HDMI %03u-%u Is Connected To", dev->inst, i); } for (u8 i = 0; i < dev->num_hdmi_inputs; i++) { struct v4l2_ctrl_config ctrl_config = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_HDMI_IS_CONNECTED_TO_OUTPUT(i), .name = dev->ctrl_hdmi_to_output_names[i], .type = V4L2_CTRL_TYPE_MENU, .max = 1, .qmenu = (const char * const *)vivid_ctrl_hdmi_to_output_strings, }; dev->ctrl_hdmi_to_output[i] = v4l2_ctrl_new_custom(hdl_vid_cap, &ctrl_config, NULL); } for (u8 i = 0; i < dev->num_svid_inputs; i++) { snprintf(dev->ctrl_svid_to_output_names[i], sizeof(dev->ctrl_svid_to_output_names[i]), "S-Video %03u-%u Is Connected To", dev->inst, i); } for (u8 i = 0; i < dev->num_svid_inputs; i++) { struct v4l2_ctrl_config ctrl_config = { .ops = &vivid_vid_cap_ctrl_ops, .id = VIVID_CID_SVID_IS_CONNECTED_TO_OUTPUT(i), .name = dev->ctrl_svid_to_output_names[i], .type = V4L2_CTRL_TYPE_MENU, .max = 1, .qmenu = (const char * const *)vivid_ctrl_svid_to_output_strings, }; dev->ctrl_svid_to_output[i] = v4l2_ctrl_new_custom(hdl_vid_cap, &ctrl_config, NULL); } if (show_ccs_cap) { dev->ctrl_has_crop_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_has_crop_cap, NULL); dev->ctrl_has_compose_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_has_compose_cap, NULL); dev->ctrl_has_scaler_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_has_scaler_cap, NULL); } v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_tstamp_src, NULL); dev->colorspace = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_colorspace, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_xfer_func, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_ycbcr_enc, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_hsv_enc, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_quantization, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_alpha_mode, NULL); } if (dev->has_vid_out && show_ccs_out) { dev->ctrl_has_crop_out = v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_has_crop_out, NULL); dev->ctrl_has_compose_out = v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_has_compose_out, NULL); dev->ctrl_has_scaler_out = v4l2_ctrl_new_custom(hdl_vid_out, &vivid_ctrl_has_scaler_out, NULL); } /* * Testing this driver with v4l2-compliance will trigger the error * injection controls, and after that nothing will work as expected. * So we have a module option to drop these error injecting controls * allowing us to run v4l2_compliance again. */ if (!no_error_inj) { v4l2_ctrl_new_custom(hdl_user_gen, &vivid_ctrl_disconnect, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_dqbuf_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_perc_dropped, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_queue_setup_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_buf_prepare_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_start_streaming_error, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_queue_error, NULL); #ifdef CONFIG_MEDIA_CONTROLLER v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_req_validate_error, NULL); #endif v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_seq_wrap, NULL); v4l2_ctrl_new_custom(hdl_streaming, &vivid_ctrl_time_wrap, NULL); } if (has_sdtv && (dev->has_vid_cap || dev->has_vbi_cap)) { if (dev->has_vid_cap) v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_std_aspect_ratio, NULL); dev->ctrl_std_signal_mode = v4l2_ctrl_new_custom(hdl_sdtv_cap, &vivid_ctrl_std_signal_mode, NULL); dev->ctrl_standard = v4l2_ctrl_new_custom(hdl_sdtv_cap, &vivid_ctrl_standard, NULL); if (dev->ctrl_std_signal_mode) v4l2_ctrl_cluster(2, &dev->ctrl_std_signal_mode); if (dev->has_raw_vbi_cap) v4l2_ctrl_new_custom(hdl_vbi_cap, &vivid_ctrl_vbi_cap_interlaced, NULL); } if (dev->num_hdmi_inputs) { s64 hdmi_input_mask = GENMASK(dev->num_hdmi_inputs - 1, 0); dev->ctrl_dv_timings_signal_mode = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_dv_timings_signal_mode, NULL); vivid_ctrl_dv_timings.max = dev->query_dv_timings_size - 1; vivid_ctrl_dv_timings.qmenu = (const char * const *)dev->query_dv_timings_qmenu; dev->ctrl_dv_timings = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_dv_timings, NULL); if (dev->ctrl_dv_timings_signal_mode) v4l2_ctrl_cluster(2, &dev->ctrl_dv_timings_signal_mode); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_dv_timings_aspect_ratio, NULL); v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_max_edid_blocks, NULL); dev->real_rgb_range_cap = v4l2_ctrl_new_custom(hdl_vid_cap, &vivid_ctrl_limited_rgb_range, NULL); dev->rgb_range_cap = v4l2_ctrl_new_std_menu(hdl_vid_cap, &vivid_vid_cap_ctrl_ops, V4L2_CID_DV_RX_RGB_RANGE, V4L2_DV_RGB_RANGE_FULL, 0, V4L2_DV_RGB_RANGE_AUTO); dev->ctrl_rx_power_present = v4l2_ctrl_new_std(hdl_vid_cap, NULL, V4L2_CID_DV_RX_POWER_PRESENT, 0, hdmi_input_mask, 0, hdmi_input_mask); } if (dev->num_hdmi_outputs) { s64 hdmi_output_mask = GENMASK(dev->num_hdmi_outputs - 1, 0); /* * We aren't doing anything with this at the moment, but * HDMI outputs typically have this controls. */ dev->ctrl_tx_rgb_range = v4l2_ctrl_new_std_menu(hdl_vid_out, NULL, V4L2_CID_DV_TX_RGB_RANGE, V4L2_DV_RGB_RANGE_FULL, 0, V4L2_DV_RGB_RANGE_AUTO); dev->ctrl_tx_mode = v4l2_ctrl_new_std_menu(hdl_vid_out, NULL, V4L2_CID_DV_TX_MODE, V4L2_DV_TX_MODE_HDMI, 0, V4L2_DV_TX_MODE_HDMI); dev->ctrl_tx_hotplug = v4l2_ctrl_new_std(hdl_vid_out, NULL, V4L2_CID_DV_TX_HOTPLUG, 0, hdmi_output_mask, 0, 0); dev->ctrl_tx_rxsense = v4l2_ctrl_new_std(hdl_vid_out, NULL, V4L2_CID_DV_TX_RXSENSE, 0, hdmi_output_mask, 0, 0); dev->ctrl_tx_edid_present = v4l2_ctrl_new_std(hdl_vid_out, NULL, V4L2_CID_DV_TX_EDID_PRESENT, 0, hdmi_output_mask, 0, 0); } if (dev->has_fb) v4l2_ctrl_new_custom(hdl_fb, &vivid_ctrl_clear_fb, NULL); if (dev->has_radio_rx) { v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_hw_seek_mode, NULL); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_hw_seek_prog_lim, NULL); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_rx_rds_blockio, NULL); v4l2_ctrl_new_custom(hdl_radio_rx, &vivid_ctrl_radio_rx_rds_rbds, NULL); v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RECEPTION, 0, 1, 1, 1); dev->radio_rx_rds_pty = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_PTY, 0, 31, 1, 0); dev->radio_rx_rds_psname = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_PS_NAME, 0, 8, 8, 0); dev->radio_rx_rds_radiotext = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_RADIO_TEXT, 0, 64, 64, 0); dev->radio_rx_rds_ta = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT, 0, 1, 1, 0); dev->radio_rx_rds_tp = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_TRAFFIC_PROGRAM, 0, 1, 1, 0); dev->radio_rx_rds_ms = v4l2_ctrl_new_std(hdl_radio_rx, &vivid_radio_rx_ctrl_ops, V4L2_CID_RDS_RX_MUSIC_SPEECH, 0, 1, 1, 1); } if (dev->has_radio_tx) { v4l2_ctrl_new_custom(hdl_radio_tx, &vivid_ctrl_radio_tx_rds_blockio, NULL); dev->radio_tx_rds_pi = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_PI, 0, 0xffff, 1, 0x8088); dev->radio_tx_rds_pty = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_PTY, 0, 31, 1, 3); dev->radio_tx_rds_psname = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_PS_NAME, 0, 8, 8, 0); if (dev->radio_tx_rds_psname) v4l2_ctrl_s_ctrl_string(dev->radio_tx_rds_psname, "VIVID-TX"); dev->radio_tx_rds_radiotext = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_RADIO_TEXT, 0, 64 * 2, 64, 0); if (dev->radio_tx_rds_radiotext) v4l2_ctrl_s_ctrl_string(dev->radio_tx_rds_radiotext, "This is a VIVID default Radio Text template text, change at will"); dev->radio_tx_rds_mono_stereo = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_MONO_STEREO, 0, 1, 1, 1); dev->radio_tx_rds_art_head = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_ARTIFICIAL_HEAD, 0, 1, 1, 0); dev->radio_tx_rds_compressed = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_COMPRESSED, 0, 1, 1, 0); dev->radio_tx_rds_dyn_pty = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_DYNAMIC_PTY, 0, 1, 1, 0); dev->radio_tx_rds_ta = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT, 0, 1, 1, 0); dev->radio_tx_rds_tp = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_TRAFFIC_PROGRAM, 0, 1, 1, 1); dev->radio_tx_rds_ms = v4l2_ctrl_new_std(hdl_radio_tx, &vivid_radio_tx_ctrl_ops, V4L2_CID_RDS_TX_MUSIC_SPEECH, 0, 1, 1, 1); } if (dev->has_sdr_cap) { v4l2_ctrl_new_custom(hdl_sdr_cap, &vivid_ctrl_sdr_cap_fm_deviation, NULL); } if (dev->has_meta_cap) { v4l2_ctrl_new_custom(hdl_meta_cap, &vivid_ctrl_meta_has_pts, NULL); v4l2_ctrl_new_custom(hdl_meta_cap, &vivid_ctrl_meta_has_src_clk, NULL); } if (hdl_user_gen->error) return hdl_user_gen->error; if (hdl_user_vid->error) return hdl_user_vid->error; if (hdl_user_aud->error) return hdl_user_aud->error; if (hdl_streaming->error) return hdl_streaming->error; if (hdl_sdr_cap->error) return hdl_sdr_cap->error; if (hdl_loop_cap->error) return hdl_loop_cap->error; if (dev->autogain) v4l2_ctrl_auto_cluster(2, &dev->autogain, 0, true); if (dev->has_vid_cap) { v4l2_ctrl_add_handler(hdl_vid_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_user_vid, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_user_aud, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_streaming, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_sdtv_cap, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_loop_cap, NULL, false); v4l2_ctrl_add_handler(hdl_vid_cap, hdl_fb, NULL, false); if (hdl_vid_cap->error) return hdl_vid_cap->error; dev->vid_cap_dev.ctrl_handler = hdl_vid_cap; } if (dev->has_vid_out) { v4l2_ctrl_add_handler(hdl_vid_out, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vid_out, hdl_user_aud, NULL, false); v4l2_ctrl_add_handler(hdl_vid_out, hdl_streaming, NULL, false); v4l2_ctrl_add_handler(hdl_vid_out, hdl_fb, NULL, false); if (hdl_vid_out->error) return hdl_vid_out->error; dev->vid_out_dev.ctrl_handler = hdl_vid_out; } if (dev->has_vbi_cap) { v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_streaming, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_sdtv_cap, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_cap, hdl_loop_cap, NULL, false); if (hdl_vbi_cap->error) return hdl_vbi_cap->error; dev->vbi_cap_dev.ctrl_handler = hdl_vbi_cap; } if (dev->has_vbi_out) { v4l2_ctrl_add_handler(hdl_vbi_out, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_vbi_out, hdl_streaming, NULL, false); if (hdl_vbi_out->error) return hdl_vbi_out->error; dev->vbi_out_dev.ctrl_handler = hdl_vbi_out; } if (dev->has_radio_rx) { v4l2_ctrl_add_handler(hdl_radio_rx, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_radio_rx, hdl_user_aud, NULL, false); if (hdl_radio_rx->error) return hdl_radio_rx->error; dev->radio_rx_dev.ctrl_handler = hdl_radio_rx; } if (dev->has_radio_tx) { v4l2_ctrl_add_handler(hdl_radio_tx, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_radio_tx, hdl_user_aud, NULL, false); if (hdl_radio_tx->error) return hdl_radio_tx->error; dev->radio_tx_dev.ctrl_handler = hdl_radio_tx; } if (dev->has_sdr_cap) { v4l2_ctrl_add_handler(hdl_sdr_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_sdr_cap, hdl_streaming, NULL, false); if (hdl_sdr_cap->error) return hdl_sdr_cap->error; dev->sdr_cap_dev.ctrl_handler = hdl_sdr_cap; } if (dev->has_meta_cap) { v4l2_ctrl_add_handler(hdl_meta_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_meta_cap, hdl_streaming, NULL, false); if (hdl_meta_cap->error) return hdl_meta_cap->error; dev->meta_cap_dev.ctrl_handler = hdl_meta_cap; } if (dev->has_meta_out) { v4l2_ctrl_add_handler(hdl_meta_out, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_meta_out, hdl_streaming, NULL, false); if (hdl_meta_out->error) return hdl_meta_out->error; dev->meta_out_dev.ctrl_handler = hdl_meta_out; } if (dev->has_touch_cap) { v4l2_ctrl_add_handler(hdl_tch_cap, hdl_user_gen, NULL, false); v4l2_ctrl_add_handler(hdl_tch_cap, hdl_streaming, NULL, false); if (hdl_tch_cap->error) return hdl_tch_cap->error; dev->touch_cap_dev.ctrl_handler = hdl_tch_cap; } return 0; } void vivid_free_controls(struct vivid_dev *dev) { v4l2_ctrl_handler_free(&dev->ctrl_hdl_vid_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_vid_out); v4l2_ctrl_handler_free(&dev->ctrl_hdl_vbi_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_vbi_out); v4l2_ctrl_handler_free(&dev->ctrl_hdl_radio_rx); v4l2_ctrl_handler_free(&dev->ctrl_hdl_radio_tx); v4l2_ctrl_handler_free(&dev->ctrl_hdl_sdr_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_user_gen); v4l2_ctrl_handler_free(&dev->ctrl_hdl_user_vid); v4l2_ctrl_handler_free(&dev->ctrl_hdl_user_aud); v4l2_ctrl_handler_free(&dev->ctrl_hdl_streaming); v4l2_ctrl_handler_free(&dev->ctrl_hdl_sdtv_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_loop_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_fb); v4l2_ctrl_handler_free(&dev->ctrl_hdl_meta_cap); v4l2_ctrl_handler_free(&dev->ctrl_hdl_meta_out); v4l2_ctrl_handler_free(&dev->ctrl_hdl_touch_cap); }
38 5 46 11 71 91 50 46 46 46 46 46 46 46 16 82 15 15 35 44 9 9 9 9 9 30 30 5 30 15 30 15 35 30 34 48 48 46 46 48 31 16 10 70 73 73 72 73 73 43 73 73 11 11 70 73 73 2 9 2 2 2 5 1 1 1 1 5 5 5 5 5 5 38 38 38 38 4 33 39 35 6 9 33 5 34 33 9 9 9 38 5 5 5 5 5 39 39 39 39 32 25 25 52 19 85 86 86 86 41 41 41 40 5 15 39 15 39 50 16 16 16 16 16 16 16 16 16 16 16 16 16 15 51 51 51 49 49 49 49 49 49 49 50 50 50 50 50 1 50 49 16 51 32 32 32 1 1 2 2 2 1 3 2 3 3 1 3 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 1 2 1 2 2 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 // SPDX-License-Identifier: GPL-2.0-or-later /* * Loopback soundcard * * Original code: * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * * More accurate positioning and full-duplex support: * Copyright (c) Ahmet İnan <ainan at mathematik.uni-freiburg.de> * * Major (almost complete) rewrite: * Copyright (c) by Takashi Iwai <tiwai@suse.de> * * A next major update in 2010 (separate timers for playback and capture): * Copyright (c) Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/platform_device.h> #include <sound/core.h> #include <sound/control.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include <sound/info.h> #include <sound/initval.h> #include <sound/timer.h> MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("A loopback soundcard"); MODULE_LICENSE("GPL"); #define MAX_PCM_SUBSTREAMS 8 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-MAX */ static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* ID for this card */ static bool enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 0}; static int pcm_substreams[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = 8}; static int pcm_notify[SNDRV_CARDS]; static char *timer_source[SNDRV_CARDS]; module_param_array(index, int, NULL, 0444); MODULE_PARM_DESC(index, "Index value for loopback soundcard."); module_param_array(id, charp, NULL, 0444); MODULE_PARM_DESC(id, "ID string for loopback soundcard."); module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable this loopback soundcard."); module_param_array(pcm_substreams, int, NULL, 0444); MODULE_PARM_DESC(pcm_substreams, "PCM substreams # (1-8) for loopback driver."); module_param_array(pcm_notify, int, NULL, 0444); MODULE_PARM_DESC(pcm_notify, "Break capture when PCM format/rate/channels changes."); module_param_array(timer_source, charp, NULL, 0444); MODULE_PARM_DESC(timer_source, "Sound card name or number and device/subdevice number of timer to be used. Empty string for jiffies timer [default]."); #define NO_PITCH 100000 #define CABLE_VALID_PLAYBACK BIT(SNDRV_PCM_STREAM_PLAYBACK) #define CABLE_VALID_CAPTURE BIT(SNDRV_PCM_STREAM_CAPTURE) #define CABLE_VALID_BOTH (CABLE_VALID_PLAYBACK | CABLE_VALID_CAPTURE) struct loopback_cable; struct loopback_pcm; struct loopback_ops { /* optional * call in loopback->cable_lock */ int (*open)(struct loopback_pcm *dpcm); /* required * call in cable->lock */ int (*start)(struct loopback_pcm *dpcm); /* required * call in cable->lock */ int (*stop)(struct loopback_pcm *dpcm); /* optional */ int (*stop_sync)(struct loopback_pcm *dpcm); /* optional */ int (*close_substream)(struct loopback_pcm *dpcm); /* optional * call in loopback->cable_lock */ int (*close_cable)(struct loopback_pcm *dpcm); /* optional * call in cable->lock */ unsigned int (*pos_update)(struct loopback_cable *cable); /* optional */ void (*dpcm_info)(struct loopback_pcm *dpcm, struct snd_info_buffer *buffer); }; struct loopback_cable { spinlock_t lock; struct loopback_pcm *streams[2]; struct snd_pcm_hardware hw; /* flags */ unsigned int valid; unsigned int running; unsigned int pause; /* timer specific */ const struct loopback_ops *ops; /* If sound timer is used */ struct { int stream; struct snd_timer_id id; struct work_struct event_work; struct snd_timer_instance *instance; } snd_timer; }; struct loopback_setup { unsigned int notify: 1; unsigned int rate_shift; snd_pcm_format_t format; unsigned int rate; snd_pcm_access_t access; unsigned int channels; struct snd_ctl_elem_id active_id; struct snd_ctl_elem_id format_id; struct snd_ctl_elem_id rate_id; struct snd_ctl_elem_id channels_id; struct snd_ctl_elem_id access_id; }; struct loopback { struct snd_card *card; struct mutex cable_lock; struct loopback_cable *cables[MAX_PCM_SUBSTREAMS][2]; struct snd_pcm *pcm[2]; struct loopback_setup setup[MAX_PCM_SUBSTREAMS][2]; const char *timer_source; }; struct loopback_pcm { struct loopback *loopback; struct snd_pcm_substream *substream; struct loopback_cable *cable; unsigned int pcm_buffer_size; unsigned int buf_pos; /* position in buffer */ unsigned int silent_size; /* PCM parameters */ unsigned int pcm_period_size; unsigned int pcm_bps; /* bytes per second */ unsigned int pcm_salign; /* bytes per sample * channels */ unsigned int pcm_rate_shift; /* rate shift value */ /* flags */ unsigned int period_update_pending :1; /* timer stuff */ unsigned int irq_pos; /* fractional IRQ position in jiffies * ticks */ unsigned int period_size_frac; /* period size in jiffies ticks */ unsigned int last_drift; unsigned long last_jiffies; /* If jiffies timer is used */ struct timer_list timer; /* size of per channel buffer in case of non-interleaved access */ unsigned int channel_buf_n; }; static struct platform_device *devices[SNDRV_CARDS]; static inline unsigned int byte_pos(struct loopback_pcm *dpcm, unsigned int x) { if (dpcm->pcm_rate_shift == NO_PITCH) { x /= HZ; } else { x = div_u64(NO_PITCH * (unsigned long long)x, HZ * (unsigned long long)dpcm->pcm_rate_shift); } return x - (x % dpcm->pcm_salign); } static inline unsigned int frac_pos(struct loopback_pcm *dpcm, unsigned int x) { if (dpcm->pcm_rate_shift == NO_PITCH) { /* no pitch */ return x * HZ; } else { x = div_u64(dpcm->pcm_rate_shift * (unsigned long long)x * HZ, NO_PITCH); } return x; } static inline struct loopback_setup *get_setup(struct loopback_pcm *dpcm) { int device = dpcm->substream->pstr->pcm->device; if (dpcm->substream->stream == SNDRV_PCM_STREAM_PLAYBACK) device ^= 1; return &dpcm->loopback->setup[dpcm->substream->number][device]; } static inline unsigned int get_notify(struct loopback_pcm *dpcm) { return get_setup(dpcm)->notify; } static inline unsigned int get_rate_shift(struct loopback_pcm *dpcm) { return get_setup(dpcm)->rate_shift; } /* call in cable->lock */ static int loopback_jiffies_timer_start(struct loopback_pcm *dpcm) { unsigned long tick; unsigned int rate_shift = get_rate_shift(dpcm); if (rate_shift != dpcm->pcm_rate_shift) { dpcm->pcm_rate_shift = rate_shift; dpcm->period_size_frac = frac_pos(dpcm, dpcm->pcm_period_size); } if (dpcm->period_size_frac <= dpcm->irq_pos) { dpcm->irq_pos %= dpcm->period_size_frac; dpcm->period_update_pending = 1; } tick = dpcm->period_size_frac - dpcm->irq_pos; tick = DIV_ROUND_UP(tick, dpcm->pcm_bps); mod_timer(&dpcm->timer, jiffies + tick); return 0; } /* call in cable->lock */ static int loopback_snd_timer_start(struct loopback_pcm *dpcm) { struct loopback_cable *cable = dpcm->cable; int err; /* Loopback device has to use same period as timer card. Therefore * wake up for each snd_pcm_period_elapsed() call of timer card. */ err = snd_timer_start(cable->snd_timer.instance, 1); if (err < 0) { /* do not report error if trying to start but already * running. For example called by opposite substream * of the same cable */ if (err == -EBUSY) return 0; pcm_err(dpcm->substream->pcm, "snd_timer_start(%d,%d,%d) failed with %d", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, err); } return err; } /* call in cable->lock */ static inline int loopback_jiffies_timer_stop(struct loopback_pcm *dpcm) { del_timer(&dpcm->timer); dpcm->timer.expires = 0; return 0; } /* call in cable->lock */ static int loopback_snd_timer_stop(struct loopback_pcm *dpcm) { struct loopback_cable *cable = dpcm->cable; int err; /* only stop if both devices (playback and capture) are not running */ if (cable->running ^ cable->pause) return 0; err = snd_timer_stop(cable->snd_timer.instance); if (err < 0) { pcm_err(dpcm->substream->pcm, "snd_timer_stop(%d,%d,%d) failed with %d", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, err); } return err; } static inline int loopback_jiffies_timer_stop_sync(struct loopback_pcm *dpcm) { del_timer_sync(&dpcm->timer); return 0; } /* call in loopback->cable_lock */ static int loopback_snd_timer_close_cable(struct loopback_pcm *dpcm) { struct loopback_cable *cable = dpcm->cable; /* snd_timer was not opened */ if (!cable->snd_timer.instance) return 0; /* will only be called from free_cable() when other stream was * already closed. Other stream cannot be reopened as long as * loopback->cable_lock is locked. Therefore no need to lock * cable->lock; */ snd_timer_close(cable->snd_timer.instance); /* wait till drain work has finished if requested */ cancel_work_sync(&cable->snd_timer.event_work); snd_timer_instance_free(cable->snd_timer.instance); memset(&cable->snd_timer, 0, sizeof(cable->snd_timer)); return 0; } static bool is_access_interleaved(snd_pcm_access_t access) { switch (access) { case SNDRV_PCM_ACCESS_MMAP_INTERLEAVED: case SNDRV_PCM_ACCESS_RW_INTERLEAVED: return true; default: return false; } }; static int loopback_check_format(struct loopback_cable *cable, int stream) { struct snd_pcm_runtime *runtime, *cruntime; struct loopback_setup *setup; struct snd_card *card; int check; if (cable->valid != CABLE_VALID_BOTH) { if (stream == SNDRV_PCM_STREAM_PLAYBACK) goto __notify; return 0; } runtime = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]-> substream->runtime; cruntime = cable->streams[SNDRV_PCM_STREAM_CAPTURE]-> substream->runtime; check = runtime->format != cruntime->format || runtime->rate != cruntime->rate || runtime->channels != cruntime->channels || is_access_interleaved(runtime->access) != is_access_interleaved(cruntime->access); if (!check) return 0; if (stream == SNDRV_PCM_STREAM_CAPTURE) { return -EIO; } else { snd_pcm_stop(cable->streams[SNDRV_PCM_STREAM_CAPTURE]-> substream, SNDRV_PCM_STATE_DRAINING); __notify: runtime = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]-> substream->runtime; setup = get_setup(cable->streams[SNDRV_PCM_STREAM_PLAYBACK]); card = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]->loopback->card; if (setup->format != runtime->format) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->format_id); setup->format = runtime->format; } if (setup->rate != runtime->rate) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->rate_id); setup->rate = runtime->rate; } if (setup->channels != runtime->channels) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->channels_id); setup->channels = runtime->channels; } if (is_access_interleaved(setup->access) != is_access_interleaved(runtime->access)) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->access_id); setup->access = runtime->access; } } return 0; } static void loopback_active_notify(struct loopback_pcm *dpcm) { snd_ctl_notify(dpcm->loopback->card, SNDRV_CTL_EVENT_MASK_VALUE, &get_setup(dpcm)->active_id); } static int loopback_trigger(struct snd_pcm_substream *substream, int cmd) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; int err = 0, stream = 1 << substream->stream; switch (cmd) { case SNDRV_PCM_TRIGGER_START: err = loopback_check_format(cable, substream->stream); if (err < 0) return err; dpcm->last_jiffies = jiffies; dpcm->pcm_rate_shift = 0; dpcm->last_drift = 0; spin_lock(&cable->lock); cable->running |= stream; cable->pause &= ~stream; err = cable->ops->start(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); break; case SNDRV_PCM_TRIGGER_STOP: spin_lock(&cable->lock); cable->running &= ~stream; cable->pause &= ~stream; err = cable->ops->stop(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); break; case SNDRV_PCM_TRIGGER_PAUSE_PUSH: case SNDRV_PCM_TRIGGER_SUSPEND: spin_lock(&cable->lock); cable->pause |= stream; err = cable->ops->stop(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); break; case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: case SNDRV_PCM_TRIGGER_RESUME: spin_lock(&cable->lock); dpcm->last_jiffies = jiffies; cable->pause &= ~stream; err = cable->ops->start(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); break; default: return -EINVAL; } return err; } static void params_change(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; cable->hw.formats = pcm_format_to_bits(runtime->format); cable->hw.rate_min = runtime->rate; cable->hw.rate_max = runtime->rate; cable->hw.channels_min = runtime->channels; cable->hw.channels_max = runtime->channels; if (cable->snd_timer.instance) { cable->hw.period_bytes_min = frames_to_bytes(runtime, runtime->period_size); cable->hw.period_bytes_max = cable->hw.period_bytes_min; } } static int loopback_prepare(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; int err, bps, salign; if (cable->ops->stop_sync) { err = cable->ops->stop_sync(dpcm); if (err < 0) return err; } salign = (snd_pcm_format_physical_width(runtime->format) * runtime->channels) / 8; bps = salign * runtime->rate; if (bps <= 0 || salign <= 0) return -EINVAL; dpcm->buf_pos = 0; dpcm->pcm_buffer_size = frames_to_bytes(runtime, runtime->buffer_size); dpcm->channel_buf_n = dpcm->pcm_buffer_size / runtime->channels; if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) { /* clear capture buffer */ dpcm->silent_size = dpcm->pcm_buffer_size; snd_pcm_format_set_silence(runtime->format, runtime->dma_area, runtime->buffer_size * runtime->channels); } dpcm->irq_pos = 0; dpcm->period_update_pending = 0; dpcm->pcm_bps = bps; dpcm->pcm_salign = salign; dpcm->pcm_period_size = frames_to_bytes(runtime, runtime->period_size); mutex_lock(&dpcm->loopback->cable_lock); if (!(cable->valid & ~(1 << substream->stream)) || (get_setup(dpcm)->notify && substream->stream == SNDRV_PCM_STREAM_PLAYBACK)) params_change(substream); cable->valid |= 1 << substream->stream; mutex_unlock(&dpcm->loopback->cable_lock); return 0; } static void clear_capture_buf(struct loopback_pcm *dpcm, unsigned int bytes) { struct snd_pcm_runtime *runtime = dpcm->substream->runtime; char *dst = runtime->dma_area; unsigned int dst_off = dpcm->buf_pos; if (dpcm->silent_size >= dpcm->pcm_buffer_size) return; if (dpcm->silent_size + bytes > dpcm->pcm_buffer_size) bytes = dpcm->pcm_buffer_size - dpcm->silent_size; for (;;) { unsigned int size = bytes; if (dst_off + size > dpcm->pcm_buffer_size) size = dpcm->pcm_buffer_size - dst_off; snd_pcm_format_set_silence(runtime->format, dst + dst_off, bytes_to_frames(runtime, size) * runtime->channels); dpcm->silent_size += size; bytes -= size; if (!bytes) break; dst_off = 0; } } static void copy_play_buf_part_n(struct loopback_pcm *play, struct loopback_pcm *capt, unsigned int size, unsigned int src_off, unsigned int dst_off) { unsigned int channels = capt->substream->runtime->channels; unsigned int size_p_ch = size / channels; unsigned int src_off_ch = src_off / channels; unsigned int dst_off_ch = dst_off / channels; int i; for (i = 0; i < channels; i++) { memcpy(capt->substream->runtime->dma_area + capt->channel_buf_n * i + dst_off_ch, play->substream->runtime->dma_area + play->channel_buf_n * i + src_off_ch, size_p_ch); } } static void copy_play_buf(struct loopback_pcm *play, struct loopback_pcm *capt, unsigned int bytes) { struct snd_pcm_runtime *runtime = play->substream->runtime; char *src = runtime->dma_area; char *dst = capt->substream->runtime->dma_area; unsigned int src_off = play->buf_pos; unsigned int dst_off = capt->buf_pos; unsigned int clear_bytes = 0; /* check if playback is draining, trim the capture copy size * when our pointer is at the end of playback ring buffer */ if (runtime->state == SNDRV_PCM_STATE_DRAINING && snd_pcm_playback_hw_avail(runtime) < runtime->buffer_size) { snd_pcm_uframes_t appl_ptr, appl_ptr1, diff; appl_ptr = appl_ptr1 = runtime->control->appl_ptr; appl_ptr1 -= appl_ptr1 % runtime->buffer_size; appl_ptr1 += play->buf_pos / play->pcm_salign; if (appl_ptr < appl_ptr1) appl_ptr1 -= runtime->buffer_size; diff = (appl_ptr - appl_ptr1) * play->pcm_salign; if (diff < bytes) { clear_bytes = bytes - diff; bytes = diff; } } for (;;) { unsigned int size = bytes; if (src_off + size > play->pcm_buffer_size) size = play->pcm_buffer_size - src_off; if (dst_off + size > capt->pcm_buffer_size) size = capt->pcm_buffer_size - dst_off; if (!is_access_interleaved(runtime->access)) copy_play_buf_part_n(play, capt, size, src_off, dst_off); else memcpy(dst + dst_off, src + src_off, size); capt->silent_size = 0; bytes -= size; if (!bytes) break; src_off = (src_off + size) % play->pcm_buffer_size; dst_off = (dst_off + size) % capt->pcm_buffer_size; } if (clear_bytes > 0) { clear_capture_buf(capt, clear_bytes); capt->silent_size = 0; } } static inline unsigned int bytepos_delta(struct loopback_pcm *dpcm, unsigned int jiffies_delta) { unsigned long last_pos; unsigned int delta; last_pos = byte_pos(dpcm, dpcm->irq_pos); dpcm->irq_pos += jiffies_delta * dpcm->pcm_bps; delta = byte_pos(dpcm, dpcm->irq_pos) - last_pos; if (delta >= dpcm->last_drift) delta -= dpcm->last_drift; dpcm->last_drift = 0; if (dpcm->irq_pos >= dpcm->period_size_frac) { dpcm->irq_pos %= dpcm->period_size_frac; dpcm->period_update_pending = 1; } return delta; } static inline void bytepos_finish(struct loopback_pcm *dpcm, unsigned int delta) { dpcm->buf_pos += delta; dpcm->buf_pos %= dpcm->pcm_buffer_size; } /* call in cable->lock */ static unsigned int loopback_jiffies_timer_pos_update (struct loopback_cable *cable) { struct loopback_pcm *dpcm_play = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]; struct loopback_pcm *dpcm_capt = cable->streams[SNDRV_PCM_STREAM_CAPTURE]; unsigned long delta_play = 0, delta_capt = 0, cur_jiffies; unsigned int running, count1, count2; cur_jiffies = jiffies; running = cable->running ^ cable->pause; if (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) { delta_play = cur_jiffies - dpcm_play->last_jiffies; dpcm_play->last_jiffies += delta_play; } if (running & (1 << SNDRV_PCM_STREAM_CAPTURE)) { delta_capt = cur_jiffies - dpcm_capt->last_jiffies; dpcm_capt->last_jiffies += delta_capt; } if (delta_play == 0 && delta_capt == 0) goto unlock; if (delta_play > delta_capt) { count1 = bytepos_delta(dpcm_play, delta_play - delta_capt); bytepos_finish(dpcm_play, count1); delta_play = delta_capt; } else if (delta_play < delta_capt) { count1 = bytepos_delta(dpcm_capt, delta_capt - delta_play); clear_capture_buf(dpcm_capt, count1); bytepos_finish(dpcm_capt, count1); delta_capt = delta_play; } if (delta_play == 0 && delta_capt == 0) goto unlock; /* note delta_capt == delta_play at this moment */ count1 = bytepos_delta(dpcm_play, delta_play); count2 = bytepos_delta(dpcm_capt, delta_capt); if (count1 < count2) { dpcm_capt->last_drift = count2 - count1; count1 = count2; } else if (count1 > count2) { dpcm_play->last_drift = count1 - count2; } copy_play_buf(dpcm_play, dpcm_capt, count1); bytepos_finish(dpcm_play, count1); bytepos_finish(dpcm_capt, count1); unlock: return running; } static void loopback_jiffies_timer_function(struct timer_list *t) { struct loopback_pcm *dpcm = from_timer(dpcm, t, timer); unsigned long flags; spin_lock_irqsave(&dpcm->cable->lock, flags); if (loopback_jiffies_timer_pos_update(dpcm->cable) & (1 << dpcm->substream->stream)) { loopback_jiffies_timer_start(dpcm); if (dpcm->period_update_pending) { dpcm->period_update_pending = 0; spin_unlock_irqrestore(&dpcm->cable->lock, flags); /* need to unlock before calling below */ snd_pcm_period_elapsed(dpcm->substream); return; } } spin_unlock_irqrestore(&dpcm->cable->lock, flags); } /* call in cable->lock */ static int loopback_snd_timer_check_resolution(struct snd_pcm_runtime *runtime, unsigned long resolution) { if (resolution != runtime->timer_resolution) { struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; /* Worst case estimation of possible values for resolution * resolution <= (512 * 1024) frames / 8kHz in nsec * resolution <= 65.536.000.000 nsec * * period_size <= 65.536.000.000 nsec / 1000nsec/usec * 192kHz + * 500.000 * period_size <= 12.582.912.000.000 <64bit * / 1.000.000 usec/sec */ snd_pcm_uframes_t period_size_usec = resolution / 1000 * runtime->rate; /* round to nearest sample rate */ snd_pcm_uframes_t period_size = (period_size_usec + 500 * 1000) / (1000 * 1000); pcm_err(dpcm->substream->pcm, "Period size (%lu frames) of loopback device is not corresponding to timer resolution (%lu nsec = %lu frames) of card timer %d,%d,%d. Use period size of %lu frames for loopback device.", runtime->period_size, resolution, period_size, cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, period_size); return -EINVAL; } return 0; } static void loopback_snd_timer_period_elapsed(struct loopback_cable *cable, int event, unsigned long resolution) { struct loopback_pcm *dpcm_play, *dpcm_capt; struct snd_pcm_substream *substream_play, *substream_capt; struct snd_pcm_runtime *valid_runtime; unsigned int running, elapsed_bytes; unsigned long flags; spin_lock_irqsave(&cable->lock, flags); running = cable->running ^ cable->pause; /* no need to do anything if no stream is running */ if (!running) { spin_unlock_irqrestore(&cable->lock, flags); return; } dpcm_play = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]; dpcm_capt = cable->streams[SNDRV_PCM_STREAM_CAPTURE]; if (event == SNDRV_TIMER_EVENT_MSTOP) { if (!dpcm_play || dpcm_play->substream->runtime->state != SNDRV_PCM_STATE_DRAINING) { spin_unlock_irqrestore(&cable->lock, flags); return; } } substream_play = (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) ? dpcm_play->substream : NULL; substream_capt = (running & (1 << SNDRV_PCM_STREAM_CAPTURE)) ? dpcm_capt->substream : NULL; valid_runtime = (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) ? dpcm_play->substream->runtime : dpcm_capt->substream->runtime; /* resolution is only valid for SNDRV_TIMER_EVENT_TICK events */ if (event == SNDRV_TIMER_EVENT_TICK) { /* The hardware rules guarantee that playback and capture period * are the same. Therefore only one device has to be checked * here. */ if (loopback_snd_timer_check_resolution(valid_runtime, resolution) < 0) { spin_unlock_irqrestore(&cable->lock, flags); if (substream_play) snd_pcm_stop_xrun(substream_play); if (substream_capt) snd_pcm_stop_xrun(substream_capt); return; } } elapsed_bytes = frames_to_bytes(valid_runtime, valid_runtime->period_size); /* The same timer interrupt is used for playback and capture device */ if ((running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) && (running & (1 << SNDRV_PCM_STREAM_CAPTURE))) { copy_play_buf(dpcm_play, dpcm_capt, elapsed_bytes); bytepos_finish(dpcm_play, elapsed_bytes); bytepos_finish(dpcm_capt, elapsed_bytes); } else if (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) { bytepos_finish(dpcm_play, elapsed_bytes); } else if (running & (1 << SNDRV_PCM_STREAM_CAPTURE)) { clear_capture_buf(dpcm_capt, elapsed_bytes); bytepos_finish(dpcm_capt, elapsed_bytes); } spin_unlock_irqrestore(&cable->lock, flags); if (substream_play) snd_pcm_period_elapsed(substream_play); if (substream_capt) snd_pcm_period_elapsed(substream_capt); } static void loopback_snd_timer_function(struct snd_timer_instance *timeri, unsigned long resolution, unsigned long ticks) { struct loopback_cable *cable = timeri->callback_data; loopback_snd_timer_period_elapsed(cable, SNDRV_TIMER_EVENT_TICK, resolution); } static void loopback_snd_timer_work(struct work_struct *work) { struct loopback_cable *cable; cable = container_of(work, struct loopback_cable, snd_timer.event_work); loopback_snd_timer_period_elapsed(cable, SNDRV_TIMER_EVENT_MSTOP, 0); } static void loopback_snd_timer_event(struct snd_timer_instance *timeri, int event, struct timespec64 *tstamp, unsigned long resolution) { /* Do not lock cable->lock here because timer->lock is already hold. * There are other functions which first lock cable->lock and than * timer->lock e.g. * loopback_trigger() * spin_lock(&cable->lock) * loopback_snd_timer_start() * snd_timer_start() * spin_lock(&timer->lock) * Therefore when using the oposit order of locks here it could result * in a deadlock. */ if (event == SNDRV_TIMER_EVENT_MSTOP) { struct loopback_cable *cable = timeri->callback_data; /* sound card of the timer was stopped. Therefore there will not * be any further timer callbacks. Due to this forward audio * data from here if in draining state. When still in running * state the streaming will be aborted by the usual timeout. It * should not be aborted here because may be the timer sound * card does only a recovery and the timer is back soon. * This work triggers loopback_snd_timer_work() */ schedule_work(&cable->snd_timer.event_work); } } static void loopback_jiffies_timer_dpcm_info(struct loopback_pcm *dpcm, struct snd_info_buffer *buffer) { snd_iprintf(buffer, " update_pending:\t%u\n", dpcm->period_update_pending); snd_iprintf(buffer, " irq_pos:\t\t%u\n", dpcm->irq_pos); snd_iprintf(buffer, " period_frac:\t%u\n", dpcm->period_size_frac); snd_iprintf(buffer, " last_jiffies:\t%lu (%lu)\n", dpcm->last_jiffies, jiffies); snd_iprintf(buffer, " timer_expires:\t%lu\n", dpcm->timer.expires); } static void loopback_snd_timer_dpcm_info(struct loopback_pcm *dpcm, struct snd_info_buffer *buffer) { struct loopback_cable *cable = dpcm->cable; snd_iprintf(buffer, " sound timer:\thw:%d,%d,%d\n", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice); snd_iprintf(buffer, " timer open:\t\t%s\n", snd_pcm_direction_name(cable->snd_timer.stream)); } static snd_pcm_uframes_t loopback_pointer(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; snd_pcm_uframes_t pos; spin_lock(&dpcm->cable->lock); if (dpcm->cable->ops->pos_update) dpcm->cable->ops->pos_update(dpcm->cable); pos = dpcm->buf_pos; spin_unlock(&dpcm->cable->lock); return bytes_to_frames(runtime, pos); } static const struct snd_pcm_hardware loopback_pcm_hardware = { .info = (SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_PAUSE | SNDRV_PCM_INFO_RESUME | SNDRV_PCM_INFO_NONINTERLEAVED), .formats = (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S16_BE | SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S24_BE | SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_S24_3BE | SNDRV_PCM_FMTBIT_S32_LE | SNDRV_PCM_FMTBIT_S32_BE | SNDRV_PCM_FMTBIT_FLOAT_LE | SNDRV_PCM_FMTBIT_FLOAT_BE | SNDRV_PCM_FMTBIT_DSD_U8 | SNDRV_PCM_FMTBIT_DSD_U16_LE | SNDRV_PCM_FMTBIT_DSD_U16_BE | SNDRV_PCM_FMTBIT_DSD_U32_LE | SNDRV_PCM_FMTBIT_DSD_U32_BE), .rates = SNDRV_PCM_RATE_CONTINUOUS | SNDRV_PCM_RATE_8000_768000, .rate_min = 8000, .rate_max = 768000, .channels_min = 1, .channels_max = 32, .buffer_bytes_max = 2 * 1024 * 1024, .period_bytes_min = 64, /* note check overflow in frac_pos() using pcm_rate_shift before changing period_bytes_max value */ .period_bytes_max = 1024 * 1024, .periods_min = 1, .periods_max = 1024, .fifo_size = 0, }; static void loopback_runtime_free(struct snd_pcm_runtime *runtime) { struct loopback_pcm *dpcm = runtime->private_data; kfree(dpcm); } static int loopback_hw_free(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; mutex_lock(&dpcm->loopback->cable_lock); cable->valid &= ~(1 << substream->stream); mutex_unlock(&dpcm->loopback->cable_lock); return 0; } static unsigned int get_cable_index(struct snd_pcm_substream *substream) { if (!substream->pcm->device) return substream->stream; else return !substream->stream; } static int rule_format(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_mask m; snd_mask_none(&m); mutex_lock(&dpcm->loopback->cable_lock); m.bits[0] = (u_int32_t)cable->hw.formats; m.bits[1] = (u_int32_t)(cable->hw.formats >> 32); mutex_unlock(&dpcm->loopback->cable_lock); return snd_mask_refine(hw_param_mask(params, rule->var), &m); } static int rule_rate(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_interval t; mutex_lock(&dpcm->loopback->cable_lock); t.min = cable->hw.rate_min; t.max = cable->hw.rate_max; mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int rule_channels(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_interval t; mutex_lock(&dpcm->loopback->cable_lock); t.min = cable->hw.channels_min; t.max = cable->hw.channels_max; mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int rule_period_bytes(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_interval t; mutex_lock(&dpcm->loopback->cable_lock); t.min = cable->hw.period_bytes_min; t.max = cable->hw.period_bytes_max; mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = 0; t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static void free_cable(struct snd_pcm_substream *substream) { struct loopback *loopback = substream->private_data; int dev = get_cable_index(substream); struct loopback_cable *cable; cable = loopback->cables[substream->number][dev]; if (!cable) return; if (cable->streams[!substream->stream]) { /* other stream is still alive */ spin_lock_irq(&cable->lock); cable->streams[substream->stream] = NULL; spin_unlock_irq(&cable->lock); } else { struct loopback_pcm *dpcm = substream->runtime->private_data; if (cable->ops && cable->ops->close_cable && dpcm) cable->ops->close_cable(dpcm); /* free the cable */ loopback->cables[substream->number][dev] = NULL; kfree(cable); } } static int loopback_jiffies_timer_open(struct loopback_pcm *dpcm) { timer_setup(&dpcm->timer, loopback_jiffies_timer_function, 0); return 0; } static const struct loopback_ops loopback_jiffies_timer_ops = { .open = loopback_jiffies_timer_open, .start = loopback_jiffies_timer_start, .stop = loopback_jiffies_timer_stop, .stop_sync = loopback_jiffies_timer_stop_sync, .close_substream = loopback_jiffies_timer_stop_sync, .pos_update = loopback_jiffies_timer_pos_update, .dpcm_info = loopback_jiffies_timer_dpcm_info, }; static int loopback_parse_timer_id(const char *str, struct snd_timer_id *tid) { /* [<pref>:](<card name>|<card idx>)[{.,}<dev idx>[{.,}<subdev idx>]] */ const char * const sep_dev = ".,"; const char * const sep_pref = ":"; const char *name = str; char *sep, save = '\0'; int card_idx = 0, dev = 0, subdev = 0; int err; sep = strpbrk(str, sep_pref); if (sep) name = sep + 1; sep = strpbrk(name, sep_dev); if (sep) { save = *sep; *sep = '\0'; } err = kstrtoint(name, 0, &card_idx); if (err == -EINVAL) { /* Must be the name, not number */ for (card_idx = 0; card_idx < snd_ecards_limit; card_idx++) { struct snd_card *card = snd_card_ref(card_idx); if (card) { if (!strcmp(card->id, name)) err = 0; snd_card_unref(card); } if (!err) break; } } if (sep) { *sep = save; if (!err) { char *sep2, save2 = '\0'; sep2 = strpbrk(sep + 1, sep_dev); if (sep2) { save2 = *sep2; *sep2 = '\0'; } err = kstrtoint(sep + 1, 0, &dev); if (sep2) { *sep2 = save2; if (!err) err = kstrtoint(sep2 + 1, 0, &subdev); } } } if (card_idx == -1) tid->dev_class = SNDRV_TIMER_CLASS_GLOBAL; if (!err && tid) { tid->card = card_idx; tid->device = dev; tid->subdevice = subdev; } return err; } /* call in loopback->cable_lock */ static int loopback_snd_timer_open(struct loopback_pcm *dpcm) { int err = 0; struct snd_timer_id tid = { .dev_class = SNDRV_TIMER_CLASS_PCM, .dev_sclass = SNDRV_TIMER_SCLASS_APPLICATION, }; struct snd_timer_instance *timeri; struct loopback_cable *cable = dpcm->cable; /* check if timer was already opened. It is only opened once * per playback and capture subdevice (aka cable). */ if (cable->snd_timer.instance) goto exit; err = loopback_parse_timer_id(dpcm->loopback->timer_source, &tid); if (err < 0) { pcm_err(dpcm->substream->pcm, "Parsing timer source \'%s\' failed with %d", dpcm->loopback->timer_source, err); goto exit; } cable->snd_timer.stream = dpcm->substream->stream; cable->snd_timer.id = tid; timeri = snd_timer_instance_new(dpcm->loopback->card->id); if (!timeri) { err = -ENOMEM; goto exit; } /* The callback has to be called from another work. If * SNDRV_TIMER_IFLG_FAST is specified it will be called from the * snd_pcm_period_elapsed() call of the selected sound card. * snd_pcm_period_elapsed() helds snd_pcm_stream_lock_irqsave(). * Due to our callback loopback_snd_timer_function() also calls * snd_pcm_period_elapsed() which calls snd_pcm_stream_lock_irqsave(). * This would end up in a dead lock. */ timeri->flags |= SNDRV_TIMER_IFLG_AUTO; timeri->callback = loopback_snd_timer_function; timeri->callback_data = (void *)cable; timeri->ccallback = loopback_snd_timer_event; /* initialise a work used for draining */ INIT_WORK(&cable->snd_timer.event_work, loopback_snd_timer_work); /* The mutex loopback->cable_lock is kept locked. * Therefore snd_timer_open() cannot be called a second time * by the other device of the same cable. * Therefore the following issue cannot happen: * [proc1] Call loopback_timer_open() -> * Unlock cable->lock for snd_timer_close/open() call * [proc2] Call loopback_timer_open() -> snd_timer_open(), * snd_timer_start() * [proc1] Call snd_timer_open() and overwrite running timer * instance */ err = snd_timer_open(timeri, &cable->snd_timer.id, current->pid); if (err < 0) { pcm_err(dpcm->substream->pcm, "snd_timer_open (%d,%d,%d) failed with %d", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, err); snd_timer_instance_free(timeri); goto exit; } cable->snd_timer.instance = timeri; exit: return err; } /* stop_sync() is not required for sound timer because it does not need to be * restarted in loopback_prepare() on Xrun recovery */ static const struct loopback_ops loopback_snd_timer_ops = { .open = loopback_snd_timer_open, .start = loopback_snd_timer_start, .stop = loopback_snd_timer_stop, .close_cable = loopback_snd_timer_close_cable, .dpcm_info = loopback_snd_timer_dpcm_info, }; static int loopback_open(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback *loopback = substream->private_data; struct loopback_pcm *dpcm; struct loopback_cable *cable = NULL; int err = 0; int dev = get_cable_index(substream); mutex_lock(&loopback->cable_lock); dpcm = kzalloc(sizeof(*dpcm), GFP_KERNEL); if (!dpcm) { err = -ENOMEM; goto unlock; } dpcm->loopback = loopback; dpcm->substream = substream; cable = loopback->cables[substream->number][dev]; if (!cable) { cable = kzalloc(sizeof(*cable), GFP_KERNEL); if (!cable) { err = -ENOMEM; goto unlock; } spin_lock_init(&cable->lock); cable->hw = loopback_pcm_hardware; if (loopback->timer_source) cable->ops = &loopback_snd_timer_ops; else cable->ops = &loopback_jiffies_timer_ops; loopback->cables[substream->number][dev] = cable; } dpcm->cable = cable; runtime->private_data = dpcm; if (cable->ops->open) { err = cable->ops->open(dpcm); if (err < 0) goto unlock; } snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS); /* use dynamic rules based on actual runtime->hw values */ /* note that the default rules created in the PCM midlevel code */ /* are cached -> they do not reflect the actual state */ err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FORMAT, rule_format, dpcm, SNDRV_PCM_HW_PARAM_FORMAT, -1); if (err < 0) goto unlock; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, rule_rate, dpcm, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) goto unlock; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, rule_channels, dpcm, SNDRV_PCM_HW_PARAM_CHANNELS, -1); if (err < 0) goto unlock; /* In case of sound timer the period time of both devices of the same * loop has to be the same. * This rule only takes effect if a sound timer was chosen */ if (cable->snd_timer.instance) { err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, rule_period_bytes, dpcm, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, -1); if (err < 0) goto unlock; } /* loopback_runtime_free() has not to be called if kfree(dpcm) was * already called here. Otherwise it will end up with a double free. */ runtime->private_free = loopback_runtime_free; if (get_notify(dpcm)) runtime->hw = loopback_pcm_hardware; else runtime->hw = cable->hw; spin_lock_irq(&cable->lock); cable->streams[substream->stream] = dpcm; spin_unlock_irq(&cable->lock); unlock: if (err < 0) { free_cable(substream); kfree(dpcm); } mutex_unlock(&loopback->cable_lock); return err; } static int loopback_close(struct snd_pcm_substream *substream) { struct loopback *loopback = substream->private_data; struct loopback_pcm *dpcm = substream->runtime->private_data; int err = 0; if (dpcm->cable->ops->close_substream) err = dpcm->cable->ops->close_substream(dpcm); mutex_lock(&loopback->cable_lock); free_cable(substream); mutex_unlock(&loopback->cable_lock); return err; } static const struct snd_pcm_ops loopback_pcm_ops = { .open = loopback_open, .close = loopback_close, .hw_free = loopback_hw_free, .prepare = loopback_prepare, .trigger = loopback_trigger, .pointer = loopback_pointer, }; static int loopback_pcm_new(struct loopback *loopback, int device, int substreams) { struct snd_pcm *pcm; int err; err = snd_pcm_new(loopback->card, "Loopback PCM", device, substreams, substreams, &pcm); if (err < 0) return err; snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &loopback_pcm_ops); snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &loopback_pcm_ops); snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_VMALLOC, NULL, 0, 0); pcm->private_data = loopback; pcm->info_flags = 0; strcpy(pcm->name, "Loopback PCM"); loopback->pcm[device] = pcm; return 0; } static int loopback_rate_shift_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 80000; uinfo->value.integer.max = 120000; uinfo->value.integer.step = 1; return 0; } static int loopback_rate_shift_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].rate_shift; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_rate_shift_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); unsigned int val; int change = 0; val = ucontrol->value.integer.value[0]; if (val < 80000) val = 80000; if (val > 120000) val = 120000; mutex_lock(&loopback->cable_lock); if (val != loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].rate_shift) { loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].rate_shift = val; change = 1; } mutex_unlock(&loopback->cable_lock); return change; } static int loopback_notify_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].notify; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_notify_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); unsigned int val; int change = 0; val = ucontrol->value.integer.value[0] ? 1 : 0; mutex_lock(&loopback->cable_lock); if (val != loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].notify) { loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].notify = val; change = 1; } mutex_unlock(&loopback->cable_lock); return change; } static int loopback_active_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); struct loopback_cable *cable; unsigned int val = 0; mutex_lock(&loopback->cable_lock); cable = loopback->cables[kcontrol->id.subdevice][kcontrol->id.device ^ 1]; if (cable != NULL) { unsigned int running = cable->running ^ cable->pause; val = (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) ? 1 : 0; } mutex_unlock(&loopback->cable_lock); ucontrol->value.integer.value[0] = val; return 0; } static int loopback_format_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = (__force int)SNDRV_PCM_FORMAT_LAST; uinfo->value.integer.step = 1; return 0; } static int loopback_format_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); ucontrol->value.integer.value[0] = (__force int)loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].format; return 0; } static int loopback_rate_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 192000; uinfo->value.integer.step = 1; return 0; } static int loopback_rate_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].rate; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_channels_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 1; uinfo->value.integer.max = 1024; uinfo->value.integer.step = 1; return 0; } static int loopback_channels_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].channels; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_access_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { const char * const texts[] = {"Interleaved", "Non-interleaved"}; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); } static int loopback_access_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); snd_pcm_access_t access; mutex_lock(&loopback->cable_lock); access = loopback->setup[kcontrol->id.subdevice][kcontrol->id.device].access; ucontrol->value.enumerated.item[0] = !is_access_interleaved(access); mutex_unlock(&loopback->cable_lock); return 0; } static const struct snd_kcontrol_new loopback_controls[] = { { .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Rate Shift 100000", .info = loopback_rate_shift_info, .get = loopback_rate_shift_get, .put = loopback_rate_shift_put, }, { .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Notify", .info = snd_ctl_boolean_mono_info, .get = loopback_notify_get, .put = loopback_notify_put, }, #define ACTIVE_IDX 2 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Active", .info = snd_ctl_boolean_mono_info, .get = loopback_active_get, }, #define FORMAT_IDX 3 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Format", .info = loopback_format_info, .get = loopback_format_get }, #define RATE_IDX 4 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Rate", .info = loopback_rate_info, .get = loopback_rate_get }, #define CHANNELS_IDX 5 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Channels", .info = loopback_channels_info, .get = loopback_channels_get }, #define ACCESS_IDX 6 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Access Mode", .info = loopback_access_info, .get = loopback_access_get, }, }; static int loopback_mixer_new(struct loopback *loopback, int notify) { struct snd_card *card = loopback->card; struct snd_pcm *pcm; struct snd_kcontrol *kctl; struct loopback_setup *setup; int err, dev, substr, substr_count, idx; strcpy(card->mixername, "Loopback Mixer"); for (dev = 0; dev < 2; dev++) { pcm = loopback->pcm[dev]; substr_count = pcm->streams[SNDRV_PCM_STREAM_CAPTURE].substream_count; for (substr = 0; substr < substr_count; substr++) { setup = &loopback->setup[substr][dev]; setup->notify = notify; setup->rate_shift = NO_PITCH; setup->format = SNDRV_PCM_FORMAT_S16_LE; setup->access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; setup->rate = 48000; setup->channels = 2; for (idx = 0; idx < ARRAY_SIZE(loopback_controls); idx++) { kctl = snd_ctl_new1(&loopback_controls[idx], loopback); if (!kctl) return -ENOMEM; kctl->id.device = dev; kctl->id.subdevice = substr; /* Add the control before copying the id so that * the numid field of the id is set in the copy. */ err = snd_ctl_add(card, kctl); if (err < 0) return err; switch (idx) { case ACTIVE_IDX: setup->active_id = kctl->id; break; case FORMAT_IDX: setup->format_id = kctl->id; break; case RATE_IDX: setup->rate_id = kctl->id; break; case CHANNELS_IDX: setup->channels_id = kctl->id; break; case ACCESS_IDX: setup->access_id = kctl->id; break; default: break; } } } } return 0; } static void print_dpcm_info(struct snd_info_buffer *buffer, struct loopback_pcm *dpcm, const char *id) { snd_iprintf(buffer, " %s\n", id); if (dpcm == NULL) { snd_iprintf(buffer, " inactive\n"); return; } snd_iprintf(buffer, " buffer_size:\t%u\n", dpcm->pcm_buffer_size); snd_iprintf(buffer, " buffer_pos:\t\t%u\n", dpcm->buf_pos); snd_iprintf(buffer, " silent_size:\t%u\n", dpcm->silent_size); snd_iprintf(buffer, " period_size:\t%u\n", dpcm->pcm_period_size); snd_iprintf(buffer, " bytes_per_sec:\t%u\n", dpcm->pcm_bps); snd_iprintf(buffer, " sample_align:\t%u\n", dpcm->pcm_salign); snd_iprintf(buffer, " rate_shift:\t\t%u\n", dpcm->pcm_rate_shift); if (dpcm->cable->ops->dpcm_info) dpcm->cable->ops->dpcm_info(dpcm, buffer); } static void print_substream_info(struct snd_info_buffer *buffer, struct loopback *loopback, int sub, int num) { struct loopback_cable *cable = loopback->cables[sub][num]; snd_iprintf(buffer, "Cable %i substream %i:\n", num, sub); if (cable == NULL) { snd_iprintf(buffer, " inactive\n"); return; } snd_iprintf(buffer, " valid: %u\n", cable->valid); snd_iprintf(buffer, " running: %u\n", cable->running); snd_iprintf(buffer, " pause: %u\n", cable->pause); print_dpcm_info(buffer, cable->streams[0], "Playback"); print_dpcm_info(buffer, cable->streams[1], "Capture"); } static void print_cable_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct loopback *loopback = entry->private_data; int sub, num; mutex_lock(&loopback->cable_lock); num = entry->name[strlen(entry->name)-1]; num = num == '0' ? 0 : 1; for (sub = 0; sub < MAX_PCM_SUBSTREAMS; sub++) print_substream_info(buffer, loopback, sub, num); mutex_unlock(&loopback->cable_lock); } static int loopback_cable_proc_new(struct loopback *loopback, int cidx) { char name[32]; snprintf(name, sizeof(name), "cable#%d", cidx); return snd_card_ro_proc_new(loopback->card, name, loopback, print_cable_info); } static void loopback_set_timer_source(struct loopback *loopback, const char *value) { if (loopback->timer_source) { devm_kfree(loopback->card->dev, loopback->timer_source); loopback->timer_source = NULL; } if (value && *value) loopback->timer_source = devm_kstrdup(loopback->card->dev, value, GFP_KERNEL); } static void print_timer_source_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct loopback *loopback = entry->private_data; mutex_lock(&loopback->cable_lock); snd_iprintf(buffer, "%s\n", loopback->timer_source ? loopback->timer_source : ""); mutex_unlock(&loopback->cable_lock); } static void change_timer_source_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct loopback *loopback = entry->private_data; char line[64]; mutex_lock(&loopback->cable_lock); if (!snd_info_get_line(buffer, line, sizeof(line))) loopback_set_timer_source(loopback, strim(line)); mutex_unlock(&loopback->cable_lock); } static int loopback_timer_source_proc_new(struct loopback *loopback) { return snd_card_rw_proc_new(loopback->card, "timer_source", loopback, print_timer_source_info, change_timer_source_info); } static int loopback_probe(struct platform_device *devptr) { struct snd_card *card; struct loopback *loopback; int dev = devptr->id; int err; err = snd_devm_card_new(&devptr->dev, index[dev], id[dev], THIS_MODULE, sizeof(struct loopback), &card); if (err < 0) return err; loopback = card->private_data; if (pcm_substreams[dev] < 1) pcm_substreams[dev] = 1; if (pcm_substreams[dev] > MAX_PCM_SUBSTREAMS) pcm_substreams[dev] = MAX_PCM_SUBSTREAMS; loopback->card = card; loopback_set_timer_source(loopback, timer_source[dev]); mutex_init(&loopback->cable_lock); err = loopback_pcm_new(loopback, 0, pcm_substreams[dev]); if (err < 0) return err; err = loopback_pcm_new(loopback, 1, pcm_substreams[dev]); if (err < 0) return err; err = loopback_mixer_new(loopback, pcm_notify[dev] ? 1 : 0); if (err < 0) return err; loopback_cable_proc_new(loopback, 0); loopback_cable_proc_new(loopback, 1); loopback_timer_source_proc_new(loopback); strcpy(card->driver, "Loopback"); strcpy(card->shortname, "Loopback"); sprintf(card->longname, "Loopback %i", dev + 1); err = snd_card_register(card); if (err < 0) return err; platform_set_drvdata(devptr, card); return 0; } static int loopback_suspend(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); snd_power_change_state(card, SNDRV_CTL_POWER_D3hot); return 0; } static int loopback_resume(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); snd_power_change_state(card, SNDRV_CTL_POWER_D0); return 0; } static DEFINE_SIMPLE_DEV_PM_OPS(loopback_pm, loopback_suspend, loopback_resume); #define SND_LOOPBACK_DRIVER "snd_aloop" static struct platform_driver loopback_driver = { .probe = loopback_probe, .driver = { .name = SND_LOOPBACK_DRIVER, .pm = &loopback_pm, }, }; static void loopback_unregister_all(void) { int i; for (i = 0; i < ARRAY_SIZE(devices); ++i) platform_device_unregister(devices[i]); platform_driver_unregister(&loopback_driver); } static int __init alsa_card_loopback_init(void) { int i, err, cards; err = platform_driver_register(&loopback_driver); if (err < 0) return err; cards = 0; for (i = 0; i < SNDRV_CARDS; i++) { struct platform_device *device; if (!enable[i]) continue; device = platform_device_register_simple(SND_LOOPBACK_DRIVER, i, NULL, 0); if (IS_ERR(device)) continue; if (!platform_get_drvdata(device)) { platform_device_unregister(device); continue; } devices[i] = device; cards++; } if (!cards) { #ifdef MODULE pr_err("aloop: No loopback enabled\n"); #endif loopback_unregister_all(); return -ENODEV; } return 0; } static void __exit alsa_card_loopback_exit(void) { loopback_unregister_all(); } module_init(alsa_card_loopback_init) module_exit(alsa_card_loopback_exit)
3 3 3 2 2 2 2 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 // SPDX-License-Identifier: GPL-2.0 /* * SME code for cfg80211 * both driver SME event handling and the SME implementation * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020, 2022-2024 Intel Corporation. All rights reserved. * Copyright 2017 Intel Deutschland GmbH */ #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/wireless.h> #include <linux/export.h> #include <net/iw_handler.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include "nl80211.h" #include "reg.h" #include "rdev-ops.h" /* * Software SME in cfg80211, using auth/assoc/deauth calls to the * driver. This is for implementing nl80211's connect/disconnect * and wireless extensions (if configured.) */ struct cfg80211_conn { struct cfg80211_connect_params params; /* these are sub-states of the _CONNECTING sme_state */ enum { CFG80211_CONN_SCANNING, CFG80211_CONN_SCAN_AGAIN, CFG80211_CONN_AUTHENTICATE_NEXT, CFG80211_CONN_AUTHENTICATING, CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, CFG80211_CONN_ASSOC_FAILED, CFG80211_CONN_ASSOC_FAILED_TIMEOUT, CFG80211_CONN_DEAUTH, CFG80211_CONN_ABANDON, CFG80211_CONN_CONNECTED, } state; u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; const u8 *ie; size_t ie_len; bool auto_auth, prev_bssid_valid; }; static void cfg80211_sme_free(struct wireless_dev *wdev) { if (!wdev->conn) return; kfree(wdev->conn->ie); kfree(wdev->conn); wdev->conn = NULL; } static int cfg80211_conn_scan(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_scan_request *request; int n_channels, err; lockdep_assert_wiphy(wdev->wiphy); if (rdev->scan_req || rdev->scan_msg) return -EBUSY; if (wdev->conn->params.channel) n_channels = 1; else n_channels = ieee80211_get_num_supported_channels(wdev->wiphy); request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + sizeof(request->channels[0]) * n_channels, GFP_KERNEL); if (!request) return -ENOMEM; request->n_channels = n_channels; if (wdev->conn->params.channel) { enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = wdev->wiphy->bands[band]; if (!sband) { kfree(request); return -EINVAL; } request->channels[0] = wdev->conn->params.channel; request->rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; enum nl80211_band band; struct ieee80211_supported_band *bands; struct ieee80211_channel *channel; for (band = 0; band < NUM_NL80211_BANDS; band++) { bands = wdev->wiphy->bands[band]; if (!bands) continue; for (j = 0; j < bands->n_channels; j++) { channel = &bands->channels[j]; if (channel->flags & IEEE80211_CHAN_DISABLED) continue; request->channels[i++] = channel; } request->rates[band] = (1 << bands->n_bitrates) - 1; } n_channels = i; } request->n_channels = n_channels; request->ssids = (void *)request + struct_size(request, channels, n_channels); request->n_ssids = 1; memcpy(request->ssids[0].ssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len); request->ssids[0].ssid_len = wdev->conn->params.ssid_len; eth_broadcast_addr(request->bssid); request->wdev = wdev; request->wiphy = &rdev->wiphy; request->scan_start = jiffies; rdev->scan_req = request; err = cfg80211_scan(rdev); if (!err) { wdev->conn->state = CFG80211_CONN_SCANNING; nl80211_send_scan_start(rdev, wdev); dev_hold(wdev->netdev); } else { rdev->scan_req = NULL; kfree(request); } return err; } static int cfg80211_conn_do_work(struct wireless_dev *wdev, enum nl80211_timeout_reason *treason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; struct cfg80211_auth_request auth_req = {}; struct cfg80211_assoc_request req = {}; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn) return 0; params = &wdev->conn->params; switch (wdev->conn->state) { case CFG80211_CONN_SCANNING: /* didn't find it during scan ... */ return -ENOENT; case CFG80211_CONN_SCAN_AGAIN: return cfg80211_conn_scan(wdev); case CFG80211_CONN_AUTHENTICATE_NEXT: if (WARN_ON(!rdev->ops->auth)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_AUTHENTICATING; auth_req.key = params->key; auth_req.key_len = params->key_len; auth_req.key_idx = params->key_idx; auth_req.auth_type = params->auth_type; auth_req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, params->bssid, params->ssid, params->ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); auth_req.link_id = -1; err = cfg80211_mlme_auth(rdev, wdev->netdev, &auth_req); cfg80211_put_bss(&rdev->wiphy, auth_req.bss); return err; case CFG80211_CONN_AUTH_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: if (WARN_ON(!rdev->ops->assoc)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_ASSOCIATING; if (wdev->conn->prev_bssid_valid) req.prev_bssid = wdev->conn->prev_bssid; req.ie = params->ie; req.ie_len = params->ie_len; req.use_mfp = params->mfp != NL80211_MFP_NO; req.crypto = params->crypto; req.flags = params->flags; req.ht_capa = params->ht_capa; req.ht_capa_mask = params->ht_capa_mask; req.vht_capa = params->vht_capa; req.vht_capa_mask = params->vht_capa_mask; req.link_id = -1; req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, params->bssid, params->ssid, params->ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); if (!req.bss) { err = -ENOENT; } else { err = cfg80211_mlme_assoc(rdev, wdev->netdev, &req, NULL); cfg80211_put_bss(&rdev->wiphy, req.bss); } if (err) cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return err; case CFG80211_CONN_ASSOC_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_ASSOC; fallthrough; case CFG80211_CONN_ASSOC_FAILED: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return -ENOTCONN; case CFG80211_CONN_DEAUTH: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); fallthrough; case CFG80211_CONN_ABANDON: /* free directly, disconnected event already sent */ cfg80211_sme_free(wdev); return 0; default: return 0; } } void cfg80211_conn_work(struct work_struct *work) { struct cfg80211_registered_device *rdev = container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; guard(wiphy)(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; if (!netif_running(wdev->netdev)) continue; if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) continue; if (wdev->conn->params.bssid) { memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); bssid = bssid_buf; } treason = NL80211_TIMEOUT_UNSPECIFIED; if (cfg80211_conn_do_work(wdev, &treason)) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = -1; cr.links[0].bssid = bssid; cr.timeout_reason = treason; __cfg80211_connect_result(wdev->netdev, &cr, false); } } } static void cfg80211_step_auth_next(struct cfg80211_conn *conn, struct cfg80211_bss *bss) { memcpy(conn->bssid, bss->bssid, ETH_ALEN); conn->params.bssid = conn->bssid; conn->params.channel = bss->channel; conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; } /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; lockdep_assert_wiphy(wdev->wiphy); bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (!bss) return NULL; cfg80211_step_auth_next(wdev->conn, bss); schedule_work(&rdev->conn_work); return bss; } void cfg80211_sme_scan_done(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn) return; if (wdev->conn->state != CFG80211_CONN_SCANNING && wdev->conn->state != CFG80211_CONN_SCAN_AGAIN) return; bss = cfg80211_get_conn_bss(wdev); if (bss) cfg80211_put_bss(&rdev->wiphy, bss); else schedule_work(&rdev->conn_work); } void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; u16 status_code = le16_to_cpu(mgmt->u.auth.status_code); lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) return; if (status_code == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG && wdev->conn->auto_auth && wdev->conn->params.auth_type != NL80211_AUTHTYPE_NETWORK_EAP) { /* select automatically between only open, shared, leap */ switch (wdev->conn->params.auth_type) { case NL80211_AUTHTYPE_OPEN_SYSTEM: if (wdev->connect_keys) wdev->conn->params.auth_type = NL80211_AUTHTYPE_SHARED_KEY; else wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; case NL80211_AUTHTYPE_SHARED_KEY: wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; default: /* huh? */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; break; } wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; schedule_work(&rdev->conn_work); } else if (status_code != WLAN_STATUS_SUCCESS) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = status_code; cr.links[0].bssid = mgmt->bssid; cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; __cfg80211_connect_result(wdev->netdev, &cr, false); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); } } bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return false; if (status == WLAN_STATUS_SUCCESS) { wdev->conn->state = CFG80211_CONN_CONNECTED; return false; } if (wdev->conn->prev_bssid_valid) { /* * Some stupid APs don't accept reassoc, so we * need to fall back to trying regular assoc; * return true so no event is sent to userspace. */ wdev->conn->prev_bssid_valid = false; wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); return true; } wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; schedule_work(&rdev->conn_work); return false; } void cfg80211_sme_deauth(struct wireless_dev *wdev) { cfg80211_sme_free(wdev); } void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_disassoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_DEAUTH; schedule_work(&rdev->conn_work); } void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ABANDON; schedule_work(&rdev->conn_work); } static void cfg80211_wdev_release_bsses(struct wireless_dev *wdev) { unsigned int link; for_each_valid_link(wdev, link) { if (!wdev->links[link].client.current_bss) continue; cfg80211_unhold_bss(wdev->links[link].client.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->links[link].client.current_bss->pub); wdev->links[link].client.current_bss = NULL; } } void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask) { unsigned int link; for_each_valid_link(wdev, link) { if (!wdev->links[link].client.current_bss || !(link_mask & BIT(link))) continue; cfg80211_unhold_bss(wdev->links[link].client.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->links[link].client.current_bss->pub); wdev->links[link].client.current_bss = NULL; } } static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, const u8 *ies, size_t ies_len, const u8 **out_ies, size_t *out_ies_len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *buf; size_t offs; if (!rdev->wiphy.extended_capabilities_len || (ies && cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, ies, ies_len))) { *out_ies = kmemdup(ies, ies_len, GFP_KERNEL); if (!*out_ies) return -ENOMEM; *out_ies_len = ies_len; return 0; } buf = kmalloc(ies_len + rdev->wiphy.extended_capabilities_len + 2, GFP_KERNEL); if (!buf) return -ENOMEM; if (ies_len) { static const u8 before_extcapa[] = { /* not listing IEs expected to be created by driver */ WLAN_EID_RSN, WLAN_EID_QOS_CAPA, WLAN_EID_RRM_ENABLED_CAPABILITIES, WLAN_EID_MOBILITY_DOMAIN, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_BSS_COEX_2040, }; offs = ieee80211_ie_split(ies, ies_len, before_extcapa, ARRAY_SIZE(before_extcapa), 0); memcpy(buf, ies, offs); /* leave a whole for extended capabilities IE */ memcpy(buf + offs + rdev->wiphy.extended_capabilities_len + 2, ies + offs, ies_len - offs); } else { offs = 0; } /* place extended capabilities IE (with only driver capabilities) */ buf[offs] = WLAN_EID_EXT_CAPABILITY; buf[offs + 1] = rdev->wiphy.extended_capabilities_len; memcpy(buf + offs + 2, rdev->wiphy.extended_capabilities, rdev->wiphy.extended_capabilities_len); *out_ies = buf; *out_ies_len = ies_len + rdev->wiphy.extended_capabilities_len + 2; return 0; } static int cfg80211_sme_connect(struct wireless_dev *wdev, struct cfg80211_connect_params *connect, const u8 *prev_bssid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; int err; if (!rdev->ops->auth || !rdev->ops->assoc) return -EOPNOTSUPP; cfg80211_wdev_release_bsses(wdev); if (wdev->connected) { cfg80211_sme_free(wdev); wdev->connected = false; } if (wdev->conn) return -EINPROGRESS; wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL); if (!wdev->conn) return -ENOMEM; /* * Copy all parameters, and treat explicitly IEs, BSSID, SSID. */ memcpy(&wdev->conn->params, connect, sizeof(*connect)); if (connect->bssid) { wdev->conn->params.bssid = wdev->conn->bssid; memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN); } if (cfg80211_sme_get_conn_ies(wdev, connect->ie, connect->ie_len, &wdev->conn->ie, &wdev->conn->params.ie_len)) { kfree(wdev->conn); wdev->conn = NULL; return -ENOMEM; } wdev->conn->params.ie = wdev->conn->ie; if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) { wdev->conn->auto_auth = true; /* start with open system ... should mostly work */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; } else { wdev->conn->auto_auth = false; } wdev->conn->params.ssid = wdev->u.client.ssid; wdev->conn->params.ssid_len = wdev->u.client.ssid_len; /* see if we have the bss already */ bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (prev_bssid) { memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); wdev->conn->prev_bssid_valid = true; } /* we're good if we have a matching bss struct */ if (bss) { enum nl80211_timeout_reason treason; cfg80211_step_auth_next(wdev->conn, bss); err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ err = cfg80211_conn_scan(wdev); /* * If we can't scan right now, then we need to scan again * after the current scan finished, since the parameters * changed (unless we find a good AP anyway). */ if (err == -EBUSY) { err = 0; wdev->conn->state = CFG80211_CONN_SCAN_AGAIN; } } if (err) cfg80211_sme_free(wdev); return err; } static int cfg80211_sme_disconnect(struct wireless_dev *wdev, u16 reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err; if (!wdev->conn) return 0; if (!rdev->ops->deauth) return -EOPNOTSUPP; if (wdev->conn->state == CFG80211_CONN_SCANNING || wdev->conn->state == CFG80211_CONN_SCAN_AGAIN) { err = 0; goto out; } /* wdev->conn->params.bssid must be set if > SCANNING */ err = cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->conn->params.bssid, NULL, 0, reason, false); out: cfg80211_sme_free(wdev); return err; } /* * code shared for in-device and software SME */ static bool cfg80211_is_all_idle(void) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; bool is_all_idle = true; /* * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. * Also if there is any other active beaconing interface we * need not issue a disconnect hint and reset any info such * as chan dfs state, etc. */ for_each_rdev(rdev) { guard(wiphy)(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->conn || wdev->connected || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; } } return is_all_idle; } static void disconnect_work(struct work_struct *work) { rtnl_lock(); if (cfg80211_is_all_idle()) regulatory_hint_disconnect(); rtnl_unlock(); } DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); static void cfg80211_connect_result_release_bsses(struct wireless_dev *wdev, struct cfg80211_connect_resp_params *cr) { unsigned int link; for_each_valid_link(cr, link) { if (!cr->links[link].bss) continue; cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } } /* * API calls for drivers implementing connect/disconnect and * SME event handling */ /* This method must consume bss one way or another */ void __cfg80211_connect_result(struct net_device *dev, struct cfg80211_connect_resp_params *cr, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; const struct element *country_elem = NULL; const struct element *ssid; const u8 *country_data; u8 country_datalen; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif unsigned int link; const u8 *connected_addr; bool bss_not_found = false; lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; if (cr->valid_links) { if (WARN_ON(!cr->ap_mld_addr)) goto out; for_each_valid_link(cr, link) { if (WARN_ON(!cr->links[link].addr)) goto out; } if (WARN_ON(wdev->connect_keys)) goto out; } wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); connected_addr = cr->valid_links ? cr->ap_mld_addr : cr->links[0].bssid; #ifdef CONFIG_CFG80211_WEXT if (wextev && !cr->valid_links) { if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->req_ie_len; wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, cr->req_ie); } if (cr->resp_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->resp_ie_len; wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, cr->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; if (connected_addr && cr->status == WLAN_STATUS_SUCCESS) { memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; } wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); } #endif if (cr->status == WLAN_STATUS_SUCCESS) { if (!wiphy_to_rdev(wdev->wiphy)->ops->connect) { for_each_valid_link(cr, link) { if (WARN_ON_ONCE(!cr->links[link].bss)) break; } } for_each_valid_link(cr, link) { /* don't do extra lookups for failures */ if (cr->links[link].status != WLAN_STATUS_SUCCESS) continue; if (cr->links[link].bss) continue; cr->links[link].bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->links[link].bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (!cr->links[link].bss) { bss_not_found = true; break; } cfg80211_hold_bss(bss_from_pub(cr->links[link].bss)); } } cfg80211_wdev_release_bsses(wdev); if (cr->status != WLAN_STATUS_SUCCESS) { kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; cfg80211_connect_result_release_bsses(wdev, cr); cfg80211_sme_free(wdev); return; } if (WARN_ON(bss_not_found)) { cfg80211_connect_result_release_bsses(wdev, cr); return; } memset(wdev->links, 0, sizeof(wdev->links)); for_each_valid_link(cr, link) { if (cr->links[link].status == WLAN_STATUS_SUCCESS) continue; cr->valid_links &= ~BIT(link); /* don't require bss pointer for failed links */ if (!cr->links[link].bss) continue; cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } wdev->valid_links = cr->valid_links; for_each_valid_link(cr, link) wdev->links[link].client.current_bss = bss_from_pub(cr->links[link].bss); wdev->connected = true; ether_addr_copy(wdev->u.client.connected_addr, connected_addr); if (cr->valid_links) { for_each_valid_link(cr, link) memcpy(wdev->links[link].addr, cr->links[link].addr, ETH_ALEN); } cfg80211_upload_connect_keys(wdev); rcu_read_lock(); for_each_valid_link(cr, link) { country_elem = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_COUNTRY); if (country_elem) break; } if (!country_elem) { rcu_read_unlock(); return; } country_datalen = country_elem->datalen; country_data = kmemdup(country_elem->data, country_datalen, GFP_ATOMIC); rcu_read_unlock(); if (!country_data) return; regulatory_hint_country_ie(wdev->wiphy, cr->links[link].bss->channel->band, country_data, country_datalen); kfree(country_data); if (!wdev->u.client.ssid_len) { rcu_read_lock(); for_each_valid_link(cr, link) { ssid = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_SSID); if (!ssid || !ssid->datalen) continue; memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen); wdev->u.client.ssid_len = ssid->datalen; break; } rcu_read_unlock(); } return; out: for_each_valid_link(cr, link) cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } static void cfg80211_update_link_bss(struct wireless_dev *wdev, struct cfg80211_bss **bss) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_internal_bss *ibss; if (!*bss) return; ibss = bss_from_pub(*bss); if (list_empty(&ibss->list)) { struct cfg80211_bss *found = NULL, *tmp = *bss; found = cfg80211_get_bss(wdev->wiphy, NULL, (*bss)->bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (found) { /* The same BSS is already updated so use it * instead, as it has latest info. */ *bss = found; } else { /* Update with BSS provided by driver, it will * be freshly added and ref cnted, we can free * the old one. * * signal_valid can be false, as we are not * expecting the BSS to be found. * * keep the old timestamp to avoid confusion */ cfg80211_bss_update(rdev, ibss, false, ibss->ts); } cfg80211_put_bss(wdev->wiphy, tmp); } } /* Consumes bss object(s) one way or another */ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_connect_resp_params *params, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; size_t link_info_size = 0; unsigned int link; for_each_valid_link(params, link) { cfg80211_update_link_bss(wdev, &params->links[link].bss); link_info_size += params->links[link].bssid ? ETH_ALEN : 0; link_info_size += params->links[link].addr ? ETH_ALEN : 0; } ev = kzalloc(sizeof(*ev) + (params->ap_mld_addr ? ETH_ALEN : 0) + params->req_ie_len + params->resp_ie_len + params->fils.kek_len + params->fils.pmk_len + (params->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size, gfp); if (!ev) { for_each_valid_link(params, link) cfg80211_put_bss(wdev->wiphy, params->links[link].bss); return; } ev->type = EVENT_CONNECT_RESULT; next = ((u8 *)ev) + sizeof(*ev); if (params->ap_mld_addr) { ev->cr.ap_mld_addr = next; memcpy((void *)ev->cr.ap_mld_addr, params->ap_mld_addr, ETH_ALEN); next += ETH_ALEN; } if (params->req_ie_len) { ev->cr.req_ie = next; ev->cr.req_ie_len = params->req_ie_len; memcpy((void *)ev->cr.req_ie, params->req_ie, params->req_ie_len); next += params->req_ie_len; } if (params->resp_ie_len) { ev->cr.resp_ie = next; ev->cr.resp_ie_len = params->resp_ie_len; memcpy((void *)ev->cr.resp_ie, params->resp_ie, params->resp_ie_len); next += params->resp_ie_len; } if (params->fils.kek_len) { ev->cr.fils.kek = next; ev->cr.fils.kek_len = params->fils.kek_len; memcpy((void *)ev->cr.fils.kek, params->fils.kek, params->fils.kek_len); next += params->fils.kek_len; } if (params->fils.pmk_len) { ev->cr.fils.pmk = next; ev->cr.fils.pmk_len = params->fils.pmk_len; memcpy((void *)ev->cr.fils.pmk, params->fils.pmk, params->fils.pmk_len); next += params->fils.pmk_len; } if (params->fils.pmkid) { ev->cr.fils.pmkid = next; memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num; if (params->fils.update_erp_next_seq_num) ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num; ev->cr.valid_links = params->valid_links; for_each_valid_link(params, link) { if (params->links[link].bss) cfg80211_hold_bss( bss_from_pub(params->links[link].bss)); ev->cr.links[link].bss = params->links[link].bss; ev->cr.links[link].status = params->links[link].status; if (params->links[link].addr) { ev->cr.links[link].addr = next; memcpy((void *)ev->cr.links[link].addr, params->links[link].addr, ETH_ALEN); next += ETH_ALEN; } if (params->links[link].bssid) { ev->cr.links[link].bssid = next; memcpy((void *)ev->cr.links[link].bssid, params->links[link].bssid, ETH_ALEN); next += ETH_ALEN; } } ev->cr.status = params->status; ev->cr.timeout_reason = params->timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_connect_done); /* Consumes bss object one way or another */ void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info) { #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif unsigned int link; const u8 *connected_addr; lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; if (WARN_ON(!wdev->connected)) goto out; if (info->valid_links) { if (WARN_ON(!info->ap_mld_addr)) goto out; for_each_valid_link(info, link) { if (WARN_ON(!info->links[link].addr)) goto out; } } cfg80211_wdev_release_bsses(wdev); for_each_valid_link(info, link) { if (WARN_ON(!info->links[link].bss)) goto out; } memset(wdev->links, 0, sizeof(wdev->links)); wdev->valid_links = info->valid_links; for_each_valid_link(info, link) { cfg80211_hold_bss(bss_from_pub(info->links[link].bss)); wdev->links[link].client.current_bss = bss_from_pub(info->links[link].bss); } connected_addr = info->valid_links ? info->ap_mld_addr : info->links[0].bss->bssid; ether_addr_copy(wdev->u.client.connected_addr, connected_addr); if (info->valid_links) { for_each_valid_link(info, link) memcpy(wdev->links[link].addr, info->links[link].addr, ETH_ALEN); } wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (!info->valid_links) { if (info->req_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->req_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCREQIE, &wrqu, info->req_ie); } if (info->resp_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->resp_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, &wrqu, info->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); } #endif return; out: for_each_valid_link(info, link) cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } /* Consumes info->links.bss object(s) one way or another */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; unsigned int link; size_t link_info_size = 0; bool bss_not_found = false; for_each_valid_link(info, link) { link_info_size += info->links[link].addr ? ETH_ALEN : 0; link_info_size += info->links[link].bssid ? ETH_ALEN : 0; if (info->links[link].bss) continue; info->links[link].bss = cfg80211_get_bss(wdev->wiphy, info->links[link].channel, info->links[link].bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (!info->links[link].bss) { bss_not_found = true; break; } } if (WARN_ON(bss_not_found)) goto out; ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + (info->fils.pmkid ? WLAN_PMKID_LEN : 0) + (info->ap_mld_addr ? ETH_ALEN : 0) + link_info_size, gfp); if (!ev) goto out; ev->type = EVENT_ROAMED; next = ((u8 *)ev) + sizeof(*ev); if (info->req_ie_len) { ev->rm.req_ie = next; ev->rm.req_ie_len = info->req_ie_len; memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); next += info->req_ie_len; } if (info->resp_ie_len) { ev->rm.resp_ie = next; ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); next += info->resp_ie_len; } if (info->fils.kek_len) { ev->rm.fils.kek = next; ev->rm.fils.kek_len = info->fils.kek_len; memcpy((void *)ev->rm.fils.kek, info->fils.kek, info->fils.kek_len); next += info->fils.kek_len; } if (info->fils.pmk_len) { ev->rm.fils.pmk = next; ev->rm.fils.pmk_len = info->fils.pmk_len; memcpy((void *)ev->rm.fils.pmk, info->fils.pmk, info->fils.pmk_len); next += info->fils.pmk_len; } if (info->fils.pmkid) { ev->rm.fils.pmkid = next; memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num; if (info->fils.update_erp_next_seq_num) ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num; if (info->ap_mld_addr) { ev->rm.ap_mld_addr = next; memcpy((void *)ev->rm.ap_mld_addr, info->ap_mld_addr, ETH_ALEN); next += ETH_ALEN; } ev->rm.valid_links = info->valid_links; for_each_valid_link(info, link) { ev->rm.links[link].bss = info->links[link].bss; if (info->links[link].addr) { ev->rm.links[link].addr = next; memcpy((void *)ev->rm.links[link].addr, info->links[link].addr, ETH_ALEN); next += ETH_ALEN; } if (info->links[link].bssid) { ev->rm.links[link].bssid = next; memcpy((void *)ev->rm.links[link].bssid, info->links[link].bssid, ETH_ALEN); next += ETH_ALEN; } } spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); return; out: for_each_valid_link(info, link) cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } EXPORT_SYMBOL(cfg80211_roamed); void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len) { lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT && wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO)) return; if (wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) { if (WARN_ON(!wdev->connected) || WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, peer_addr))) return; } nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, peer_addr, td_bitmap, td_bitmap_len); } void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; if (WARN_ON(!peer_addr)) return; ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.peer_addr, peer_addr, ETH_ALEN); ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev); ev->pa.td_bitmap_len = td_bitmap_len; memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len); /* * Use the wdev event list so that if there are pending * connected/roamed events, they will be reported first. */ spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_port_authorized); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; cfg80211_wdev_release_bsses(wdev); wdev->valid_links = 0; wdev->connected = false; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); /* stop critical protocol if supported */ if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. */ if (rdev->ops->del_key) { int max_key_idx = 5; if (wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, -1, i, false, NULL); } rdev_set_qos_map(rdev, dev, NULL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); wdev->wext.connect.ssid_len = 0; #endif schedule_work(&cfg80211_disconnect_work); cfg80211_schedule_channels_check(wdev); } void cfg80211_disconnected(struct net_device *dev, u16 reason, const u8 *ie, size_t ie_len, bool locally_generated, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; ev = kzalloc(sizeof(*ev) + ie_len, gfp); if (!ev) return; ev->type = EVENT_DISCONNECTED; ev->dc.ie = ((u8 *)ev) + sizeof(*ev); ev->dc.ie_len = ie_len; memcpy((void *)ev->dc.ie, ie, ie_len); ev->dc.reason = reason; ev->dc.locally_generated = locally_generated; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_disconnected); /* * API calls for nl80211/wext compatibility code */ int cfg80211_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *connect, struct cfg80211_cached_keys *connkeys, const u8 *prev_bssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); /* * If we have an ssid_len, we're trying to connect or are * already connected, so reject a new SSID unless it's the * same (which is the case for re-association.) */ if (wdev->u.client.ssid_len && (wdev->u.client.ssid_len != connect->ssid_len || memcmp(wdev->u.client.ssid, connect->ssid, wdev->u.client.ssid_len))) return -EALREADY; /* * If connected, reject (re-)association unless prev_bssid * matches the current BSSID. */ if (wdev->connected) { if (!prev_bssid) return -EALREADY; if (!ether_addr_equal(prev_bssid, wdev->u.client.connected_addr)) return -ENOTCONN; } /* * Reject if we're in the process of connecting with WEP, * this case isn't very interesting and trying to handle * it would make the code much more complex. */ if (wdev->connect_keys) return -EINPROGRESS; cfg80211_oper_and_ht_capa(&connect->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; u32 cipher; idx = connkeys->def; cipher = connkeys->params[idx].cipher; /* If given a WEP key we may need it for shared key auth */ if (cipher == WLAN_CIPHER_SUITE_WEP40 || cipher == WLAN_CIPHER_SUITE_WEP104) { connect->key_idx = idx; connect->key = connkeys->params[idx].key; connect->key_len = connkeys->params[idx].key_len; /* * If ciphers are not set (e.g. when going through * iwconfig), we have to set them appropriately here. */ if (connect->crypto.cipher_group == 0) connect->crypto.cipher_group = cipher; if (connect->crypto.n_ciphers_pairwise == 0) { connect->crypto.n_ciphers_pairwise = 1; connect->crypto.ciphers_pairwise[0] = cipher; } } } else { if (WARN_ON(connkeys)) return -EINVAL; /* connect can point to wdev->wext.connect which * can hold key data from a previous connection */ connect->key = NULL; connect->key_len = 0; connect->key_idx = 0; } wdev->connect_keys = connkeys; memcpy(wdev->u.client.ssid, connect->ssid, connect->ssid_len); wdev->u.client.ssid_len = connect->ssid_len; wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS : IEEE80211_BSS_TYPE_ESS; if (!rdev->ops->connect) err = cfg80211_sme_connect(wdev, connect, prev_bssid); else err = rdev_connect(rdev, dev, connect); if (err) { wdev->connect_keys = NULL; /* * This could be reassoc getting refused, don't clear * ssid_len in that case. */ if (!wdev->connected) wdev->u.client.ssid_len = 0; return err; } return 0; } int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err = 0; lockdep_assert_wiphy(wdev->wiphy); kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->conn_owner_nlportid = 0; if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) cfg80211_mlme_down(rdev, dev); else if (wdev->u.client.ssid_len) err = rdev_disconnect(rdev, dev, reason); /* * Clear ssid_len unless we actually were fully connected, * in which case cfg80211_disconnected() will take care of * this later. */ if (!wdev->connected) wdev->u.client.ssid_len = 0; return err; } /* * Used to clean up after the connection / connection attempt owner socket * disconnects */ void cfg80211_autodisconnect_wk(struct work_struct *work) { struct wireless_dev *wdev = container_of(work, struct wireless_dev, disconnect_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); guard(wiphy)(wdev->wiphy); if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, -1, false); break; case NL80211_IFTYPE_MESH_POINT: cfg80211_leave_mesh(rdev, wdev->netdev); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: /* * Use disconnect_bssid if still connecting and * ops->disconnect not implemented. Otherwise we can * use cfg80211_disconnect. */ if (rdev->ops->disconnect || wdev->connected) cfg80211_disconnect(rdev, wdev->netdev, WLAN_REASON_DEAUTH_LEAVING, true); else cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->disconnect_bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); break; default: break; } } }
290 207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM signal #if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SIGNAL_H #include <linux/signal.h> #include <linux/sched.h> #include <linux/tracepoint.h> #define TP_STORE_SIGINFO(__entry, info) \ do { \ if (info == SEND_SIG_NOINFO) { \ __entry->errno = 0; \ __entry->code = SI_USER; \ } else if (info == SEND_SIG_PRIV) { \ __entry->errno = 0; \ __entry->code = SI_KERNEL; \ } else { \ __entry->errno = info->si_errno; \ __entry->code = info->si_code; \ } \ } while (0) #ifndef TRACE_HEADER_MULTI_READ enum { TRACE_SIGNAL_DELIVERED, TRACE_SIGNAL_IGNORED, TRACE_SIGNAL_ALREADY_PENDING, TRACE_SIGNAL_OVERFLOW_FAIL, TRACE_SIGNAL_LOSE_INFO, }; #endif /** * signal_generate - called when a signal is generated * @sig: signal number * @info: pointer to struct siginfo * @task: pointer to struct task_struct * @group: shared or private * @result: TRACE_SIGNAL_* * * Current process sends a 'sig' signal to 'task' process with * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, * 'info' is not a pointer and you can't access its field. Instead, * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV * means that si_code is SI_KERNEL. */ TRACE_EVENT(signal_generate, TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task, int group, int result), TP_ARGS(sig, info, task, group, result), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, group ) __field( int, result ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->pid = task->pid; __entry->group = group; __entry->result = result; ), TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d", __entry->sig, __entry->errno, __entry->code, __entry->comm, __entry->pid, __entry->group, __entry->result) ); /** * signal_deliver - called when a signal is delivered * @sig: signal number * @info: pointer to struct siginfo * @ka: pointer to struct k_sigaction * * A 'sig' signal is delivered to current process with 'info' siginfo, * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or * SIG_DFL. * Note that some signals reported by signal_generate tracepoint can be * lost, ignored or modified (by debugger) before hitting this tracepoint. * This means, this can show which signals are actually delivered, but * matching generated signals and delivered signals may not be correct. */ TRACE_EVENT(signal_deliver, TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka), TP_ARGS(sig, info, ka), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __field( unsigned long, sa_handler ) __field( unsigned long, sa_flags ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); __entry->sa_handler = (unsigned long)ka->sa.sa_handler; __entry->sa_flags = ka->sa.sa_flags; ), TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx", __entry->sig, __entry->errno, __entry->code, __entry->sa_handler, __entry->sa_flags) ); #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
392 392 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright 2020 NXP */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gate.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_gate_ops; static ktime_t gate_get_time(struct tcf_gate *gact) { ktime_t mono = ktime_get(); switch (gact->tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, gact->tk_offset); } return KTIME_MAX; } static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) { struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; u64 n; base = ns_to_ktime(param->tcfg_basetime); now = gate_get_time(gact); if (ktime_after(base, now)) { *start = base; return; } cycle = param->tcfg_cycletime; n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) { ktime_t expires; expires = hrtimer_get_expires(&gact->hitimer); if (expires == 0) expires = KTIME_MAX; start = min_t(ktime_t, start, expires); hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); } static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate *gact = container_of(timer, struct tcf_gate, hitimer); struct tcf_gate_params *p = &gact->param; struct tcfg_gate_entry *next; ktime_t close_time, now; spin_lock(&gact->tcf_lock); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; gact->current_entry_octets = 0; gact->current_max_octets = next->maxoctets; gact->current_close_time = ktime_add_ns(gact->current_close_time, next->interval); close_time = gact->current_close_time; if (list_is_last(&next->list, &p->entries)) next = list_first_entry(&p->entries, struct tcfg_gate_entry, list); else next = list_next_entry(next, list); now = gate_get_time(gact); if (ktime_after(now, close_time)) { ktime_t cycle, base; u64 n; cycle = p->tcfg_cycletime; base = ns_to_ktime(p->tcfg_basetime); n = div64_u64(ktime_sub_ns(now, base), cycle); close_time = ktime_add_ns(base, (n + 1) * cycle); } gact->next_entry = next; hrtimer_set_expires(&gact->hitimer, close_time); spin_unlock(&gact->tcf_lock); return HRTIMER_RESTART; } TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); int action = READ_ONCE(gact->tcf_action); tcf_lastuse_update(&gact->tcf_tm); tcf_action_update_bstats(&gact->common, skb); spin_lock(&gact->tcf_lock); if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { spin_unlock(&gact->tcf_lock); return action; } if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) { spin_unlock(&gact->tcf_lock); goto drop; } if (gact->current_max_octets >= 0) { gact->current_entry_octets += qdisc_pkt_len(skb); if (gact->current_entry_octets > gact->current_max_octets) { spin_unlock(&gact->tcf_lock); goto overlimit; } } spin_unlock(&gact->tcf_lock); return action; overlimit: tcf_action_inc_overlimit_qstats(&gact->common); drop: tcf_action_inc_drop_qstats(&gact->common); return TC_ACT_SHOT; } static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, }; static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { [TCA_GATE_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)), [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, [TCA_GATE_FLAGS] = { .type = NLA_U32 }, [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, }; static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, struct netlink_ext_ack *extack) { u32 interval = 0; entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); if (tb[TCA_GATE_ENTRY_INTERVAL]) interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); if (interval == 0) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1); entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS], -1); return 0; } static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_gate_entry(tb, entry, extack); } static void release_entry_list(struct list_head *entries) { struct tcfg_gate_entry *entry, *e; list_for_each_entry_safe(entry, e, entries, list) { list_del(&entry->list); kfree(entry); } } static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) { struct tcfg_gate_entry *entry; struct nlattr *n; int err, rem; int i = 0; if (!list_attr) return -EINVAL; nla_for_each_nested(n, list_attr, rem) { if (nla_type(n) != TCA_GATE_ONE_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); err = -ENOMEM; goto release_list; } err = parse_gate_entry(n, entry, i, extack); if (err < 0) { kfree(entry); goto release_list; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; release_list: release_entry_list(&sched->entries); return err; } static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, enum tk_offsets tko, s32 clockid, bool do_init) { if (!do_init) { if (basetime == gact->param.tcfg_basetime && tko == gact->tk_offset && clockid == gact->param.tcfg_clockid) return; spin_unlock_bh(&gact->tcf_lock); hrtimer_cancel(&gact->hitimer); spin_lock_bh(&gact->tcf_lock); } gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); gact->hitimer.function = gate_timer_func; } static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; u64 cycletime = 0, basetime = 0; struct tcf_gate_params *p; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; int ret = 0, err; u32 gflags = 0; s32 prio = -1; ktime_t start; u32 index; if (!nla) return -EINVAL; err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); if (err < 0) return err; if (!tb[TCA_GATE_PARMS]) return -EINVAL; if (tb[TCA_GATE_CLOCKID]) { clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); return -EINVAL; } } parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; if (err && bind) return ACT_P_BOUND; if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_gate_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); if (tb[TCA_GATE_BASE_TIME]) basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); gact = to_gate(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&gact->param.entries); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&gact->tcf_lock); p = &gact->param; if (tb[TCA_GATE_CYCLE_TIME]) cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) goto chain_put; } if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; if (!cycletime) { err = -EINVAL; goto chain_put; } } p->tcfg_cycletime = cycletime; if (tb[TCA_GATE_CYCLE_TIME_EXT]) p->tcfg_cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); gate_setup_timer(gact, basetime, tk_offset, clockid, ret == ACT_P_CREATED); p->tcfg_priority = prio; p->tcfg_flags = gflags; gate_get_start_time(gact, &start); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; gact->next_entry = list_first_entry(&p->entries, struct tcfg_gate_entry, list); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); gate_start_timer(gact, start); spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; chain_put: spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: /* action is not inserted in any list: it's safe to init hitimer * without taking tcf_lock. */ if (ret == ACT_P_CREATED) gate_setup_timer(gact, gact->param.tcfg_basetime, gact->tk_offset, gact->param.tcfg_clockid, true); tcf_idr_release(*a, bind); return err; } static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; p = &gact->param; hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } static int dumping_entry(struct sk_buff *skb, struct tcfg_gate_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) goto nla_put_failure; return nla_nest_end(skb, item); nla_put_failure: nla_nest_cancel(skb, item); return -1; } static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gate *gact = to_gate(a); struct tc_gate opt = { .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, }; struct tcfg_gate_entry *entry; struct tcf_gate_params *p; struct nlattr *entry_list; struct tcf_t t; spin_lock_bh(&gact->tcf_lock); opt.action = gact->tcf_action; p = &gact->param; if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, p->tcfg_basetime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, p->tcfg_cycletime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, p->tcfg_cycletime_ext, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) goto nla_put_failure; entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); if (!entry_list) goto nla_put_failure; list_for_each_entry(entry, &p->entries, list) { if (dumping_entry(skb, entry) < 0) goto nla_put_failure; } nla_nest_end(skb, entry_list); tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; spin_unlock_bh(&gact->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_gate *gact = to_gate(a); struct tcf_t *tm = &gact->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static size_t tcf_gate_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_gate)); } static void tcf_gate_entry_destructor(void *priv) { struct action_gate_entry *oe = priv; kfree(oe); } static int tcf_gate_get_entries(struct flow_action_entry *entry, const struct tc_action *act) { entry->gate.entries = tcf_gate_get_list(act); if (!entry->gate.entries) return -EINVAL; entry->destructor = tcf_gate_entry_destructor; entry->destructor_priv = entry->gate.entries; return 0; } static int tcf_gate_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { int err; if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_GATE; entry->gate.prio = tcf_gate_prio(act); entry->gate.basetime = tcf_gate_basetime(act); entry->gate.cycletime = tcf_gate_cycletime(act); entry->gate.cycletimeext = tcf_gate_cycletimeext(act); entry->gate.num_entries = tcf_gate_num_entries(act); err = tcf_gate_get_entries(entry, act); if (err) return err; *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_GATE; } return 0; } static struct tc_action_ops act_gate_ops = { .kind = "gate", .id = TCA_ID_GATE, .owner = THIS_MODULE, .act = tcf_gate_act, .dump = tcf_gate_dump, .init = tcf_gate_init, .cleanup = tcf_gate_cleanup, .stats_update = tcf_gate_stats_update, .get_fill_size = tcf_gate_get_fill_size, .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; MODULE_ALIAS_NET_ACT("gate"); static __net_init int gate_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); return tc_action_net_init(net, tn, &act_gate_ops); } static void __net_exit gate_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_gate_ops.net_id); } static struct pernet_operations gate_net_ops = { .init = gate_init_net, .exit_batch = gate_exit_net, .id = &act_gate_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init gate_init_module(void) { return tcf_register_action(&act_gate_ops, &gate_net_ops); } static void __exit gate_cleanup_module(void) { tcf_unregister_action(&act_gate_ops, &gate_net_ops); } module_init(gate_init_module); module_exit(gate_cleanup_module); MODULE_DESCRIPTION("TC gate action"); MODULE_LICENSE("GPL v2");
1 1 3 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 2005-2006 Ian Kent <raven@themaw.net> */ #include <linux/seq_file.h> #include <linux/pagemap.h> #include "autofs_i.h" struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { struct autofs_info *ino; ino = kzalloc(sizeof(*ino), GFP_KERNEL); if (ino) { INIT_LIST_HEAD(&ino->active); INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; ino->exp_timeout = -1; ino->count = 1; } return ino; } void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; ino->exp_timeout = -1; ino->last_used = jiffies; } void autofs_free_ino(struct autofs_info *ino) { kfree_rcu(ino, rcu); } void autofs_kill_sb(struct super_block *sb) { struct autofs_sb_info *sbi = autofs_sbi(sb); /* * In the event of a failure in get_sb_nodev the superblock * info is not present so nothing else has been setup, so * just call kill_anon_super when we are called from * deactivate_super. */ if (sbi) { /* Free wait queues, close pipe */ autofs_catatonic_mode(sbi); put_pid(sbi->oz_pgrp); } pr_debug("shutting down\n"); kill_litter_super(sb); if (sbi) kfree_rcu(sbi, rcu); } static int autofs_show_options(struct seq_file *m, struct dentry *root) { struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) return 0; seq_printf(m, ",fd=%d", sbi->pipefd); if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, root_inode->i_uid)); if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); if (autofs_type_offset(sbi->type)) seq_puts(m, ",offset"); else if (autofs_type_direct(sbi->type)) seq_puts(m, ",direct"); else seq_puts(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) seq_puts(m, ",strictexpire"); if (sbi->flags & AUTOFS_SBI_IGNORE) seq_puts(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); else seq_puts(m, ",pipe_ino=-1"); #endif return 0; } static void autofs_evict_inode(struct inode *inode) { clear_inode(inode); kfree(inode->i_private); } static const struct super_operations autofs_sops = { .statfs = simple_statfs, .show_options = autofs_show_options, .evict_inode = autofs_evict_inode, }; enum { Opt_direct, Opt_fd, Opt_gid, Opt_ignore, Opt_indirect, Opt_maxproto, Opt_minproto, Opt_offset, Opt_pgrp, Opt_strictexpire, Opt_uid, }; const struct fs_parameter_spec autofs_param_specs[] = { fsparam_flag ("direct", Opt_direct), fsparam_fd ("fd", Opt_fd), fsparam_gid ("gid", Opt_gid), fsparam_flag ("ignore", Opt_ignore), fsparam_flag ("indirect", Opt_indirect), fsparam_u32 ("maxproto", Opt_maxproto), fsparam_u32 ("minproto", Opt_minproto), fsparam_flag ("offset", Opt_offset), fsparam_u32 ("pgrp", Opt_pgrp), fsparam_flag ("strictexpire", Opt_strictexpire), fsparam_uid ("uid", Opt_uid), {} }; struct autofs_fs_context { kuid_t uid; kgid_t gid; int pgrp; bool pgrp_set; }; /* * Open the fd. We do it here rather than in get_tree so that it's done in the * context of the system call that passed the data and not the one that * triggered the superblock creation, lest the fd gets reassigned. */ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, struct fs_parameter *param, struct fs_parse_result *result) { struct file *pipe; int ret; if (param->type == fs_value_is_file) { /* came through the new api */ pipe = param->file; param->file = NULL; } else { pipe = fget(result->uint_32); } if (!pipe) { errorf(fc, "could not open pipe file descriptor"); return -EBADF; } ret = autofs_check_pipe(pipe); if (ret < 0) { errorf(fc, "Invalid/unusable pipe"); fput(pipe); return -EBADF; } autofs_set_packet_pipe_flags(pipe); if (sbi->pipe) fput(sbi->pipe); sbi->pipefd = result->uint_32; sbi->pipe = pipe; return 0; } static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, autofs_param_specs, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_fd: return autofs_parse_fd(fc, sbi, param, &result); case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_pgrp: ctx->pgrp = result.uint_32; ctx->pgrp_set = true; break; case Opt_minproto: sbi->min_proto = result.uint_32; break; case Opt_maxproto: sbi->max_proto = result.uint_32; break; case Opt_indirect: set_autofs_type_indirect(&sbi->type); break; case Opt_direct: set_autofs_type_direct(&sbi->type); break; case Opt_offset: set_autofs_type_offset(&sbi->type); break; case Opt_strictexpire: sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; break; case Opt_ignore: sbi->flags |= AUTOFS_SBI_IGNORE; } return 0; } static struct autofs_sb_info *autofs_alloc_sbi(void) { struct autofs_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->magic = AUTOFS_SBI_MAGIC; sbi->flags = AUTOFS_SBI_CATATONIC; sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; sbi->pipefd = -1; set_autofs_type_indirect(&sbi->type); mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); spin_lock_init(&sbi->lookup_lock); INIT_LIST_HEAD(&sbi->active_list); INIT_LIST_HEAD(&sbi->expiring_list); return sbi; } static int autofs_validate_protocol(struct fs_context *fc) { struct autofs_sb_info *sbi = fc->s_fs_info; /* Test versions first */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { errorf(fc, "kernel does not match daemon version " "daemon (%d, %d) kernel (%d, %d)\n", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); return -EINVAL; } /* Establish highest kernel protocol version */ if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) sbi->version = AUTOFS_MAX_PROTO_VERSION; else sbi->version = sbi->max_proto; switch (sbi->version) { case 4: sbi->sub_version = 7; break; case 5: sbi->sub_version = AUTOFS_PROTO_SUBVERSION; break; default: sbi->sub_version = 0; } return 0; } static int autofs_fill_super(struct super_block *s, struct fs_context *fc) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = s->s_fs_info; struct inode *root_inode; struct autofs_info *ino; pr_debug("starting up, sbi = %p\n", sbi); sbi->sb = s; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; s->s_op = &autofs_sops; s->s_d_op = &autofs_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. */ ino = autofs_new_ino(sbi); if (!ino) return -ENOMEM; root_inode = autofs_get_inode(s, S_IFDIR | 0755); if (!root_inode) return -ENOMEM; root_inode->i_uid = ctx->uid; root_inode->i_gid = ctx->gid; root_inode->i_fop = &autofs_root_operations; root_inode->i_op = &autofs_dir_inode_operations; s->s_root = d_make_root(root_inode); if (unlikely(!s->s_root)) { autofs_free_ino(ino); return -ENOMEM; } s->s_root->d_fsdata = ino; if (ctx->pgrp_set) { sbi->oz_pgrp = find_get_pid(ctx->pgrp); if (!sbi->oz_pgrp) return invalf(fc, "Could not find process group %d", ctx->pgrp); } else sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); if (autofs_type_trigger(sbi->type)) /* s->s_root won't be contended so there's little to * be gained by not taking the d_lock when setting * d_flags, even when a lot mounts are being done. */ managed_dentry_set_managed(s->s_root); pr_debug("pipe fd = %d, pgrp = %u\n", sbi->pipefd, pid_nr(sbi->oz_pgrp)); sbi->flags &= ~AUTOFS_SBI_CATATONIC; return 0; } /* * Validate the parameters and then request a superblock. */ static int autofs_get_tree(struct fs_context *fc) { struct autofs_sb_info *sbi = fc->s_fs_info; int ret; ret = autofs_validate_protocol(fc); if (ret) return ret; if (sbi->pipefd < 0) return invalf(fc, "No control pipe specified"); return get_tree_nodev(fc, autofs_fill_super); } static void autofs_free_fc(struct fs_context *fc) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = fc->s_fs_info; if (sbi) { if (sbi->pipe) fput(sbi->pipe); kfree(sbi); } kfree(ctx); } static const struct fs_context_operations autofs_context_ops = { .free = autofs_free_fc, .parse_param = autofs_parse_param, .get_tree = autofs_get_tree, }; /* * Set up the filesystem mount context. */ int autofs_init_fs_context(struct fs_context *fc) { struct autofs_fs_context *ctx; struct autofs_sb_info *sbi; ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL); if (!ctx) goto nomem; ctx->uid = current_uid(); ctx->gid = current_gid(); sbi = autofs_alloc_sbi(); if (!sbi) goto nomem_ctx; fc->fs_private = ctx; fc->s_fs_info = sbi; fc->ops = &autofs_context_ops; return 0; nomem_ctx: kfree(ctx); nomem: return -ENOMEM; } struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); if (inode == NULL) return NULL; inode->i_mode = mode; if (sb->s_root) { inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } simple_inode_init_ts(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { set_nlink(inode, 2); inode->i_op = &autofs_dir_inode_operations; inode->i_fop = &autofs_dir_operations; } else if (S_ISLNK(mode)) { inode->i_op = &autofs_symlink_inode_operations; } else WARN_ON(1); return inode; }
7 7 8 8 7 8 6 6 4 4 1 3 1 2 1 1 1 2 2 1 1 1 6 4 4 4 1 3 1 2 1 1 1 4 3 4 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 // SPDX-License-Identifier: GPL-2.0 /* * Documentation/ABI/stable/sysfs-fs-orangefs: * * What: /sys/fs/orangefs/perf_counter_reset * Date: June 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * echo a 0 or a 1 into perf_counter_reset to * reset all the counters in * /sys/fs/orangefs/perf_counters * except ones with PINT_PERF_PRESERVE set. * * * What: /sys/fs/orangefs/perf_counters/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Counters and settings for various caches. * Read only. * * * What: /sys/fs/orangefs/perf_time_interval_secs * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Length of perf counter intervals in * seconds. * * * What: /sys/fs/orangefs/perf_history_size * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * The perf_counters cache statistics have N, or * perf_history_size, samples. The default is * one. * * Every perf_time_interval_secs the (first) * samples are reset. * * If N is greater than one, the "current" set * of samples is reset, and the samples from the * other N-1 intervals remain available. * * * What: /sys/fs/orangefs/op_timeout_secs * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Service operation timeout in seconds. * * * What: /sys/fs/orangefs/slot_timeout_secs * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * "Slot" timeout in seconds. A "slot" * is an indexed buffer in the shared * memory segment used for communication * between the kernel module and userspace. * Slots are requested and waited for, * the wait times out after slot_timeout_secs. * * What: /sys/fs/orangefs/cache_timeout_msecs * Date: Mar 2018 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Time in milliseconds between which * orangefs_revalidate_mapping will invalidate the page * cache. * * What: /sys/fs/orangefs/dcache_timeout_msecs * Date: Jul 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Time lookup is valid in milliseconds. * * What: /sys/fs/orangefs/getattr_timeout_msecs * Date: Jul 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Time getattr is valid in milliseconds. * * What: /sys/fs/orangefs/readahead_count * Date: Aug 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Readahead cache buffer count. * * What: /sys/fs/orangefs/readahead_size * Date: Aug 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Readahead cache buffer size. * * What: /sys/fs/orangefs/readahead_count_size * Date: Aug 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Readahead cache buffer count and size. * * What: /sys/fs/orangefs/readahead_readcnt * Date: Jan 2017 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Number of buffers (in multiples of readahead_size) * which can be read ahead for a single file at once. * * What: /sys/fs/orangefs/acache/... * Date: Jun 2015 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Attribute cache configurable settings. * * * What: /sys/fs/orangefs/ncache/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Name cache configurable settings. * * * What: /sys/fs/orangefs/capcache/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Capability cache configurable settings. * * * What: /sys/fs/orangefs/ccache/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Credential cache configurable settings. * */ #include <linux/fs.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/module.h> #include <linux/init.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-sysfs.h" #define ORANGEFS_KOBJ_ID "orangefs" #define ACACHE_KOBJ_ID "acache" #define CAPCACHE_KOBJ_ID "capcache" #define CCACHE_KOBJ_ID "ccache" #define NCACHE_KOBJ_ID "ncache" #define PC_KOBJ_ID "pc" #define STATS_KOBJ_ID "stats" /* * Every item calls orangefs_attr_show and orangefs_attr_store through * orangefs_sysfs_ops. They look at the orangefs_attributes further below to * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or * sysfs_service_op_store. */ struct orangefs_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct orangefs_attribute *attr, char *buf); ssize_t (*store)(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count); }; static ssize_t orangefs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct orangefs_attribute *attribute; attribute = container_of(attr, struct orangefs_attribute, attr); if (!attribute->show) return -EIO; return attribute->show(kobj, attribute, buf); } static ssize_t orangefs_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct orangefs_attribute *attribute; if (!strcmp(kobj->name, PC_KOBJ_ID) || !strcmp(kobj->name, STATS_KOBJ_ID)) return -EPERM; attribute = container_of(attr, struct orangefs_attribute, attr); if (!attribute->store) return -EIO; return attribute->store(kobj, attribute, buf, len); } static const struct sysfs_ops orangefs_sysfs_ops = { .show = orangefs_attr_show, .store = orangefs_attr_store, }; static ssize_t sysfs_int_show(struct kobject *kobj, struct orangefs_attribute *attr, char *buf) { int rc = -EIO; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj->name); if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "op_timeout_secs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", op_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", orangefs_cache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", orangefs_getattr_timeout_msecs); goto out; } else { goto out; } } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "reads")) { rc = scnprintf(buf, PAGE_SIZE, "%lu\n", orangefs_stats.reads); goto out; } else if (!strcmp(attr->attr.name, "writes")) { rc = scnprintf(buf, PAGE_SIZE, "%lu\n", orangefs_stats.writes); goto out; } else { goto out; } } out: return rc; } static ssize_t sysfs_int_store(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count) { int rc = 0; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n", attr->attr.name, buf); if (!strcmp(attr->attr.name, "op_timeout_secs")) { rc = kstrtoint(buf, 0, &op_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = kstrtoint(buf, 0, &slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs); goto out; } else { goto out; } out: if (rc) rc = -EINVAL; else rc = count; return rc; } /* * obtain attribute values from userspace with a service operation. */ static ssize_t sysfs_service_op_show(struct kobject *kobj, struct orangefs_attribute *attr, char *buf) { struct orangefs_kernel_op_s *new_op = NULL; int rc = 0; char *ser_op_type = NULL; __u32 op_alloc_type; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_show: id:%s:\n", kobj->name); if (strcmp(kobj->name, PC_KOBJ_ID)) op_alloc_type = ORANGEFS_VFS_OP_PARAM; else op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT; new_op = op_alloc(op_alloc_type); if (!new_op) return -ENOMEM; /* Can't do a service_operation if the client is not running... */ rc = is_daemon_in_service(); if (rc) { pr_info_ratelimited("%s: Client not running :%d:\n", __func__, is_daemon_in_service()); goto out; } if (strcmp(kobj->name, PC_KOBJ_ID)) new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET; if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { /* Drop unsupported requests first. */ if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && (!strcmp(attr->attr.name, "readahead_count") || !strcmp(attr->attr.name, "readahead_size") || !strcmp(attr->attr.name, "readahead_count_size") || !strcmp(attr->attr.name, "readahead_readcnt"))) { rc = -EINVAL; goto out; } if (!strcmp(attr->attr.name, "perf_history_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; else if (!strcmp(attr->attr.name, "perf_counter_reset")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; else if (!strcmp(attr->attr.name, "readahead_count")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; else if (!strcmp(attr->attr.name, "readahead_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; else if (!strcmp(attr->attr.name, "readahead_count_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; else if (!strcmp(attr->attr.name, "readahead_readcnt")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_READCNT; } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, PC_KOBJ_ID)) { if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_ACACHE; if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE; if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_NCACHE; } else { gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n", kobj->name); rc = -EINVAL; goto out; } if (strcmp(kobj->name, PC_KOBJ_ID)) ser_op_type = "orangefs_param"; else ser_op_type = "orangefs_perf_count"; /* * The service_operation will return an errno return code on * error, and zero on success. */ rc = service_operation(new_op, ser_op_type, ORANGEFS_OP_INTERRUPTIBLE); out: if (!rc) { if (strcmp(kobj->name, PC_KOBJ_ID)) { if (new_op->upcall.req.param.op == ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { rc = scnprintf(buf, PAGE_SIZE, "%d %d\n", (int)new_op->downcall.resp.param.u. value32[0], (int)new_op->downcall.resp.param.u. value32[1]); } else { rc = scnprintf(buf, PAGE_SIZE, "%d\n", (int)new_op->downcall.resp.param.u.value64); } } else { rc = scnprintf( buf, PAGE_SIZE, "%s", new_op->downcall.resp.perf_count.buffer); } } op_release(new_op); return rc; } /* * pass attribute values back to userspace with a service operation. * * We have to do a memory allocation, an sscanf and a service operation. * And we have to evaluate what the user entered, to make sure the * value is within the range supported by the attribute. So, there's * a lot of return code checking and mapping going on here. * * We want to return 1 if we think everything went OK, and * EINVAL if not. */ static ssize_t sysfs_service_op_store(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count) { struct orangefs_kernel_op_s *new_op = NULL; int val = 0; int rc = 0; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_store: id:%s:\n", kobj->name); new_op = op_alloc(ORANGEFS_VFS_OP_PARAM); if (!new_op) return -EINVAL; /* sic */ /* Can't do a service_operation if the client is not running... */ rc = is_daemon_in_service(); if (rc) { pr_info("%s: Client not running :%d:\n", __func__, is_daemon_in_service()); goto out; } /* * The value we want to send back to userspace is in buf, unless this * there are two parameters, which is specially handled below. */ if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) || strcmp(attr->attr.name, "readahead_count_size")) { rc = kstrtoint(buf, 0, &val); if (rc) goto out; } new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { /* Drop unsupported requests first. */ if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && (!strcmp(attr->attr.name, "readahead_count") || !strcmp(attr->attr.name, "readahead_size") || !strcmp(attr->attr.name, "readahead_count_size") || !strcmp(attr->attr.name, "readahead_readcnt"))) { rc = -EINVAL; goto out; } if (!strcmp(attr->attr.name, "perf_history_size")) { if (val > 0) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) { if (val > 0) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "perf_counter_reset")) { if ((val == 0) || (val == 1)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "readahead_count")) { if ((val >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "readahead_size")) { if ((val >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "readahead_count_size")) { int val1, val2; rc = sscanf(buf, "%d %d", &val1, &val2); if (rc < 2) { rc = 0; goto out; } if ((val1 >= 0) && (val2 >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; } else { rc = 0; goto out; } new_op->upcall.req.param.u.value32[0] = val1; new_op->upcall.req.param.u.value32[1] = val2; goto value_set; } else if (!strcmp(attr->attr.name, "readahead_readcnt")) { if ((val >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_READCNT; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; } else { rc = 0; goto out; } } } else { gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n", kobj->name); rc = -EINVAL; goto out; } new_op->upcall.req.param.u.value64 = val; value_set: /* * The service_operation will return a errno return code on * error, and zero on success. */ rc = service_operation(new_op, "orangefs_param", ORANGEFS_OP_INTERRUPTIBLE); if (rc < 0) { gossip_err("sysfs_service_op_store: service op returned:%d:\n", rc); rc = 0; } else { rc = count; } out: op_release(new_op); if (rc == -ENOMEM || rc == 0) rc = -EINVAL; return rc; } static struct orangefs_attribute op_timeout_secs_attribute = __ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute slot_timeout_secs_attribute = __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute cache_timeout_msecs_attribute = __ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute dcache_timeout_msecs_attribute = __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute getattr_timeout_msecs_attribute = __ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute readahead_count_attribute = __ATTR(readahead_count, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute readahead_size_attribute = __ATTR(readahead_size, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute readahead_count_size_attribute = __ATTR(readahead_count_size, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute readahead_readcnt_attribute = __ATTR(readahead_readcnt, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute perf_counter_reset_attribute = __ATTR(perf_counter_reset, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute perf_history_size_attribute = __ATTR(perf_history_size, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute perf_time_interval_secs_attribute = __ATTR(perf_time_interval_secs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *orangefs_default_attrs[] = { &op_timeout_secs_attribute.attr, &slot_timeout_secs_attribute.attr, &cache_timeout_msecs_attribute.attr, &dcache_timeout_msecs_attribute.attr, &getattr_timeout_msecs_attribute.attr, &readahead_count_attribute.attr, &readahead_size_attribute.attr, &readahead_count_size_attribute.attr, &readahead_readcnt_attribute.attr, &perf_counter_reset_attribute.attr, &perf_history_size_attribute.attr, &perf_time_interval_secs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(orangefs_default); static struct kobject *orangefs_obj; static void orangefs_obj_release(struct kobject *kobj) { kfree(orangefs_obj); orangefs_obj = NULL; } static const struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = orangefs_default_groups, .release = orangefs_obj_release, }; static struct orangefs_attribute acache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute acache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute acache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute acache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *acache_orangefs_default_attrs[] = { &acache_hard_limit_attribute.attr, &acache_reclaim_percent_attribute.attr, &acache_soft_limit_attribute.attr, &acache_timeout_msecs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(acache_orangefs_default); static struct kobject *acache_orangefs_obj; static void acache_orangefs_obj_release(struct kobject *kobj) { kfree(acache_orangefs_obj); acache_orangefs_obj = NULL; } static const struct kobj_type acache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = acache_orangefs_default_groups, .release = acache_orangefs_obj_release, }; static struct orangefs_attribute capcache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute capcache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute capcache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute capcache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *capcache_orangefs_default_attrs[] = { &capcache_hard_limit_attribute.attr, &capcache_reclaim_percent_attribute.attr, &capcache_soft_limit_attribute.attr, &capcache_timeout_secs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(capcache_orangefs_default); static struct kobject *capcache_orangefs_obj; static void capcache_orangefs_obj_release(struct kobject *kobj) { kfree(capcache_orangefs_obj); capcache_orangefs_obj = NULL; } static const struct kobj_type capcache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = capcache_orangefs_default_groups, .release = capcache_orangefs_obj_release, }; static struct orangefs_attribute ccache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ccache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ccache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ccache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *ccache_orangefs_default_attrs[] = { &ccache_hard_limit_attribute.attr, &ccache_reclaim_percent_attribute.attr, &ccache_soft_limit_attribute.attr, &ccache_timeout_secs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(ccache_orangefs_default); static struct kobject *ccache_orangefs_obj; static void ccache_orangefs_obj_release(struct kobject *kobj) { kfree(ccache_orangefs_obj); ccache_orangefs_obj = NULL; } static const struct kobj_type ccache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ccache_orangefs_default_groups, .release = ccache_orangefs_obj_release, }; static struct orangefs_attribute ncache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ncache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ncache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ncache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *ncache_orangefs_default_attrs[] = { &ncache_hard_limit_attribute.attr, &ncache_reclaim_percent_attribute.attr, &ncache_soft_limit_attribute.attr, &ncache_timeout_msecs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(ncache_orangefs_default); static struct kobject *ncache_orangefs_obj; static void ncache_orangefs_obj_release(struct kobject *kobj) { kfree(ncache_orangefs_obj); ncache_orangefs_obj = NULL; } static const struct kobj_type ncache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ncache_orangefs_default_groups, .release = ncache_orangefs_obj_release, }; static struct orangefs_attribute pc_acache_attribute = __ATTR(acache, 0664, sysfs_service_op_show, NULL); static struct orangefs_attribute pc_capcache_attribute = __ATTR(capcache, 0664, sysfs_service_op_show, NULL); static struct orangefs_attribute pc_ncache_attribute = __ATTR(ncache, 0664, sysfs_service_op_show, NULL); static struct attribute *pc_orangefs_default_attrs[] = { &pc_acache_attribute.attr, &pc_capcache_attribute.attr, &pc_ncache_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(pc_orangefs_default); static struct kobject *pc_orangefs_obj; static void pc_orangefs_obj_release(struct kobject *kobj) { kfree(pc_orangefs_obj); pc_orangefs_obj = NULL; } static const struct kobj_type pc_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = pc_orangefs_default_groups, .release = pc_orangefs_obj_release, }; static struct orangefs_attribute stats_reads_attribute = __ATTR(reads, 0664, sysfs_int_show, NULL); static struct orangefs_attribute stats_writes_attribute = __ATTR(writes, 0664, sysfs_int_show, NULL); static struct attribute *stats_orangefs_default_attrs[] = { &stats_reads_attribute.attr, &stats_writes_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(stats_orangefs_default); static struct kobject *stats_orangefs_obj; static void stats_orangefs_obj_release(struct kobject *kobj) { kfree(stats_orangefs_obj); stats_orangefs_obj = NULL; } static const struct kobj_type stats_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = stats_orangefs_default_groups, .release = stats_orangefs_obj_release, }; int orangefs_sysfs_init(void) { int rc = -EINVAL; gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_init: start\n"); /* create /sys/fs/orangefs. */ orangefs_obj = kzalloc(sizeof(*orangefs_obj), GFP_KERNEL); if (!orangefs_obj) goto out; rc = kobject_init_and_add(orangefs_obj, &orangefs_ktype, fs_kobj, ORANGEFS_KOBJ_ID); if (rc) goto ofs_obj_bail; kobject_uevent(orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/acache. */ acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL); if (!acache_orangefs_obj) { rc = -EINVAL; goto ofs_obj_bail; } rc = kobject_init_and_add(acache_orangefs_obj, &acache_orangefs_ktype, orangefs_obj, ACACHE_KOBJ_ID); if (rc) goto acache_obj_bail; kobject_uevent(acache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/capcache. */ capcache_orangefs_obj = kzalloc(sizeof(*capcache_orangefs_obj), GFP_KERNEL); if (!capcache_orangefs_obj) { rc = -EINVAL; goto acache_obj_bail; } rc = kobject_init_and_add(capcache_orangefs_obj, &capcache_orangefs_ktype, orangefs_obj, CAPCACHE_KOBJ_ID); if (rc) goto capcache_obj_bail; kobject_uevent(capcache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ccache. */ ccache_orangefs_obj = kzalloc(sizeof(*ccache_orangefs_obj), GFP_KERNEL); if (!ccache_orangefs_obj) { rc = -EINVAL; goto capcache_obj_bail; } rc = kobject_init_and_add(ccache_orangefs_obj, &ccache_orangefs_ktype, orangefs_obj, CCACHE_KOBJ_ID); if (rc) goto ccache_obj_bail; kobject_uevent(ccache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ncache. */ ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL); if (!ncache_orangefs_obj) { rc = -EINVAL; goto ccache_obj_bail; } rc = kobject_init_and_add(ncache_orangefs_obj, &ncache_orangefs_ktype, orangefs_obj, NCACHE_KOBJ_ID); if (rc) goto ncache_obj_bail; kobject_uevent(ncache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/perf_counters. */ pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL); if (!pc_orangefs_obj) { rc = -EINVAL; goto ncache_obj_bail; } rc = kobject_init_and_add(pc_orangefs_obj, &pc_orangefs_ktype, orangefs_obj, "perf_counters"); if (rc) goto pc_obj_bail; kobject_uevent(pc_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/stats. */ stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL); if (!stats_orangefs_obj) { rc = -EINVAL; goto pc_obj_bail; } rc = kobject_init_and_add(stats_orangefs_obj, &stats_orangefs_ktype, orangefs_obj, STATS_KOBJ_ID); if (rc) goto stats_obj_bail; kobject_uevent(stats_orangefs_obj, KOBJ_ADD); goto out; stats_obj_bail: kobject_put(stats_orangefs_obj); pc_obj_bail: kobject_put(pc_orangefs_obj); ncache_obj_bail: kobject_put(ncache_orangefs_obj); ccache_obj_bail: kobject_put(ccache_orangefs_obj); capcache_obj_bail: kobject_put(capcache_orangefs_obj); acache_obj_bail: kobject_put(acache_orangefs_obj); ofs_obj_bail: kobject_put(orangefs_obj); out: return rc; } void orangefs_sysfs_exit(void) { gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n"); kobject_put(acache_orangefs_obj); kobject_put(capcache_orangefs_obj); kobject_put(ccache_orangefs_obj); kobject_put(ncache_orangefs_obj); kobject_put(pc_orangefs_obj); kobject_put(stats_orangefs_obj); kobject_put(orangefs_obj); }
4 4 4 4 4 4 4 4 23 24 24 24 3 3 3 3 3 3 3 21 21 21 21 21 21 5 5 5 3 3 5 3 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 8 21 21 21 21 21 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 scan result handling * * Copyright 2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/wireless.h> #include <linux/nl80211.h> #include <linux/etherdevice.h> #include <linux/crc32.h> #include <linux/bitfield.h> #include <net/arp.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include <net/iw_handler.h> #include <kunit/visibility.h> #include "core.h" #include "nl80211.h" #include "wext-compat.h" #include "rdev-ops.h" /** * DOC: BSS tree/list structure * * At the top level, the BSS list is kept in both a list in each * registered device (@bss_list) as well as an RB-tree for faster * lookup. In the RB-tree, entries can be looked up using their * channel, MESHID, MESHCONF (for MBSSes) or channel, BSSID, SSID * for other BSSes. * * Due to the possibility of hidden SSIDs, there's a second level * structure, the "hidden_list" and "hidden_beacon_bss" pointer. * The hidden_list connects all BSSes belonging to a single AP * that has a hidden SSID, and connects beacon and probe response * entries. For a probe response entry for a hidden SSID, the * hidden_beacon_bss pointer points to the BSS struct holding the * beacon's information. * * Reference counting is done for all these references except for * the hidden_list, so that a beacon BSS struct that is otherwise * not referenced has one reference for being on the bss_list and * one for each probe response entry that points to it using the * hidden_beacon_bss pointer. When a BSS struct that has such a * pointer is get/put, the refcount update is also propagated to * the referenced struct, this ensure that it cannot get removed * while somebody is using the probe response version. * * Note that the hidden_beacon_bss pointer never changes, due to * the reference counting. Therefore, no locking is needed for * it. * * Also note that the hidden_beacon_bss pointer is only relevant * if the driver uses something other than the IEs, e.g. private * data stored in the BSS struct, since the beacon IEs are * also linked into the probe response struct. */ /* * Limit the number of BSS entries stored in mac80211. Each one is * a bit over 4k at most, so this limits to roughly 4-5M of memory. * If somebody wants to really attack this though, they'd likely * use small beacons, and only one type of frame, limiting each of * the entries to a much smaller size (in order to generate more * entries in total, so overhead is bigger.) */ static int bss_entries_limit = 1000; module_param(bss_entries_limit, int, 0644); MODULE_PARM_DESC(bss_entries_limit, "limit to number of scan BSS entries (per wiphy, default 1000)"); #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) static void bss_free(struct cfg80211_internal_bss *bss) { struct cfg80211_bss_ies *ies; if (WARN_ON(atomic_read(&bss->hold))) return; ies = (void *)rcu_access_pointer(bss->pub.beacon_ies); if (ies && !bss->pub.hidden_beacon_bss) kfree_rcu(ies, rcu_head); ies = (void *)rcu_access_pointer(bss->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); /* * This happens when the module is removed, it doesn't * really matter any more save for completeness */ if (!list_empty(&bss->hidden_list)) list_del(&bss->hidden_list); kfree(bss); } static inline void bss_ref_get(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); bss->refcount++; if (bss->pub.hidden_beacon_bss) bss_from_pub(bss->pub.hidden_beacon_bss)->refcount++; if (bss->pub.transmitted_bss) bss_from_pub(bss->pub.transmitted_bss)->refcount++; } static inline void bss_ref_put(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (bss->pub.hidden_beacon_bss) { struct cfg80211_internal_bss *hbss; hbss = bss_from_pub(bss->pub.hidden_beacon_bss); hbss->refcount--; if (hbss->refcount == 0) bss_free(hbss); } if (bss->pub.transmitted_bss) { struct cfg80211_internal_bss *tbss; tbss = bss_from_pub(bss->pub.transmitted_bss); tbss->refcount--; if (tbss->refcount == 0) bss_free(tbss); } bss->refcount--; if (bss->refcount == 0) bss_free(bss); } static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (!list_empty(&bss->hidden_list)) { /* * don't remove the beacon entry if it has * probe responses associated with it */ if (!bss->pub.hidden_beacon_bss) return false; /* * if it's a probe response entry break its * link to the other entries in the group */ list_del_init(&bss->hidden_list); } list_del_init(&bss->list); list_del_init(&bss->pub.nontrans_list); rb_erase(&bss->rbn, &rdev->bss_tree); rdev->bss_entries--; WARN_ONCE((rdev->bss_entries == 0) ^ list_empty(&rdev->bss_list), "rdev bss entries[%d]/list[empty:%d] corruption\n", rdev->bss_entries, list_empty(&rdev->bss_list)); bss_ref_put(rdev, bss); return true; } bool cfg80211_is_element_inherited(const struct element *elem, const struct element *non_inherit_elem) { u8 id_len, ext_id_len, i, loop_len, id; const u8 *list; if (elem->id == WLAN_EID_MULTIPLE_BSSID) return false; if (elem->id == WLAN_EID_EXTENSION && elem->datalen > 1 && elem->data[0] == WLAN_EID_EXT_EHT_MULTI_LINK) return false; if (!non_inherit_elem || non_inherit_elem->datalen < 2) return true; /* * non inheritance element format is: * ext ID (56) | IDs list len | list | extension IDs list len | list * Both lists are optional. Both lengths are mandatory. * This means valid length is: * elem_len = 1 (extension ID) + 2 (list len fields) + list lengths */ id_len = non_inherit_elem->data[1]; if (non_inherit_elem->datalen < 3 + id_len) return true; ext_id_len = non_inherit_elem->data[2 + id_len]; if (non_inherit_elem->datalen < 3 + id_len + ext_id_len) return true; if (elem->id == WLAN_EID_EXTENSION) { if (!ext_id_len) return true; loop_len = ext_id_len; list = &non_inherit_elem->data[3 + id_len]; id = elem->data[0]; } else { if (!id_len) return true; loop_len = id_len; list = &non_inherit_elem->data[2]; id = elem->id; } for (i = 0; i < loop_len; i++) { if (list[i] == id) return false; } return true; } EXPORT_SYMBOL(cfg80211_is_element_inherited); static size_t cfg80211_copy_elem_with_frags(const struct element *elem, const u8 *ie, size_t ie_len, u8 **pos, u8 *buf, size_t buf_len) { if (WARN_ON((u8 *)elem < ie || elem->data > ie + ie_len || elem->data + elem->datalen > ie + ie_len)) return 0; if (elem->datalen + 2 > buf + buf_len - *pos) return 0; memcpy(*pos, elem, elem->datalen + 2); *pos += elem->datalen + 2; /* Finish if it is not fragmented */ if (elem->datalen != 255) return *pos - buf; ie_len = ie + ie_len - elem->data - elem->datalen; ie = (const u8 *)elem->data + elem->datalen; for_each_element(elem, ie, ie_len) { if (elem->id != WLAN_EID_FRAGMENT) break; if (elem->datalen + 2 > buf + buf_len - *pos) return 0; memcpy(*pos, elem, elem->datalen + 2); *pos += elem->datalen + 2; if (elem->datalen != 255) break; } return *pos - buf; } VISIBLE_IF_CFG80211_KUNIT size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, const u8 *subie, size_t subie_len, u8 *new_ie, size_t new_ie_len) { const struct element *non_inherit_elem, *parent, *sub; u8 *pos = new_ie; const u8 *mbssid_index_ie; u8 id, ext_id, bssid_index = 255; unsigned int match_len; non_inherit_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, subie, subie_len); mbssid_index_ie = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, subie, subie_len); if (mbssid_index_ie && mbssid_index_ie[1] > 0 && mbssid_index_ie[2] > 0 && mbssid_index_ie[2] <= 46) bssid_index = mbssid_index_ie[2]; /* We copy the elements one by one from the parent to the generated * elements. * If they are not inherited (included in subie or in the non * inheritance element), then we copy all occurrences the first time * we see this element type. */ for_each_element(parent, ie, ielen) { if (parent->id == WLAN_EID_FRAGMENT) continue; if (parent->id == WLAN_EID_EXTENSION) { if (parent->datalen < 1) continue; id = WLAN_EID_EXTENSION; ext_id = parent->data[0]; match_len = 1; } else { id = parent->id; match_len = 0; } /* Find first occurrence in subie */ sub = cfg80211_find_elem_match(id, subie, subie_len, &ext_id, match_len, 0); /* Copy from parent if not in subie and inherited */ if (!sub && cfg80211_is_element_inherited(parent, non_inherit_elem)) { if (!cfg80211_copy_elem_with_frags(parent, ie, ielen, &pos, new_ie, new_ie_len)) return 0; continue; } /* For ML probe response, match the MLE in the frame body with * MLD id being 'bssid_index' */ if (parent->id == WLAN_EID_EXTENSION && parent->datalen > 1 && parent->data[0] == WLAN_EID_EXT_EHT_MULTI_LINK && bssid_index == ieee80211_mle_get_mld_id(parent->data + 1)) { if (!cfg80211_copy_elem_with_frags(parent, ie, ielen, &pos, new_ie, new_ie_len)) return 0; /* Continue here to prevent processing the MLE in * sub-element, which AP MLD should not carry */ continue; } /* Already copied if an earlier element had the same type */ if (cfg80211_find_elem_match(id, ie, (u8 *)parent - ie, &ext_id, match_len, 0)) continue; /* Not inheriting, copy all similar elements from subie */ while (sub) { if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len, &pos, new_ie, new_ie_len)) return 0; sub = cfg80211_find_elem_match(id, sub->data + sub->datalen, subie_len + subie - (sub->data + sub->datalen), &ext_id, match_len, 0); } } /* The above misses elements that are included in subie but not in the * parent, so do a pass over subie and append those. * Skip the non-tx BSSID caps and non-inheritance element. */ for_each_element(sub, subie, subie_len) { if (sub->id == WLAN_EID_NON_TX_BSSID_CAP) continue; if (sub->id == WLAN_EID_FRAGMENT) continue; if (sub->id == WLAN_EID_EXTENSION) { if (sub->datalen < 1) continue; id = WLAN_EID_EXTENSION; ext_id = sub->data[0]; match_len = 1; if (ext_id == WLAN_EID_EXT_NON_INHERITANCE) continue; } else { id = sub->id; match_len = 0; } /* Processed if one was included in the parent */ if (cfg80211_find_elem_match(id, ie, ielen, &ext_id, match_len, 0)) continue; if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len, &pos, new_ie, new_ie_len)) return 0; } return pos - new_ie; } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_gen_new_ie); static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, const u8 *ssid, size_t ssid_len) { const struct cfg80211_bss_ies *ies; const struct element *ssid_elem; if (bssid && !ether_addr_equal(a->bssid, bssid)) return false; if (!ssid) return true; ies = rcu_access_pointer(a->ies); if (!ies) return false; ssid_elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); if (!ssid_elem) return false; if (ssid_elem->datalen != ssid_len) return false; return memcmp(ssid_elem->data, ssid, ssid_len) == 0; } static int cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, struct cfg80211_bss *nontrans_bss) { const struct element *ssid_elem; struct cfg80211_bss *bss = NULL; rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem(nontrans_bss, WLAN_EID_SSID); if (!ssid_elem) { rcu_read_unlock(); return -EINVAL; } /* check if nontrans_bss is in the list */ list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) { if (is_bss(bss, nontrans_bss->bssid, ssid_elem->data, ssid_elem->datalen)) { rcu_read_unlock(); return 0; } } rcu_read_unlock(); /* * This is a bit weird - it's not on the list, but already on another * one! The only way that could happen is if there's some BSSID/SSID * shared by multiple APs in their multi-BSSID profiles, potentially * with hidden SSID mixed in ... ignore it. */ if (!list_empty(&nontrans_bss->nontrans_list)) return -EINVAL; /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; } static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev, unsigned long expire_time) { struct cfg80211_internal_bss *bss, *tmp; bool expired = false; lockdep_assert_held(&rdev->bss_lock); list_for_each_entry_safe(bss, tmp, &rdev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!time_after(expire_time, bss->ts)) continue; if (__cfg80211_unlink_bss(rdev, bss)) expired = true; } if (expired) rdev->bss_generation++; } static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *bss, *oldest = NULL; bool ret; lockdep_assert_held(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!list_empty(&bss->hidden_list) && !bss->pub.hidden_beacon_bss) continue; if (oldest && time_before(oldest->ts, bss->ts)) continue; oldest = bss; } if (WARN_ON(!oldest)) return false; /* * The callers make sure to increase rdev->bss_generation if anything * gets removed (and a new entry added), so there's no need to also do * it here. */ ret = __cfg80211_unlink_bss(rdev, oldest); WARN_ON(!ret); return ret; } static u8 cfg80211_parse_bss_param(u8 data, struct cfg80211_colocated_ap *coloc_ap) { coloc_ap->oct_recommended = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED); coloc_ap->same_ssid = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_SAME_SSID); coloc_ap->multi_bss = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID); coloc_ap->transmitted_bssid = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID); coloc_ap->unsolicited_probe = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE); coloc_ap->colocated_ess = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS); return u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_AP); } static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies, const struct element **elem, u32 *s_ssid) { *elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); if (!*elem || (*elem)->datalen > IEEE80211_MAX_SSID_LEN) return -EINVAL; *s_ssid = ~crc32_le(~0, (*elem)->data, (*elem)->datalen); return 0; } VISIBLE_IF_CFG80211_KUNIT void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) { struct cfg80211_colocated_ap *ap, *tmp_ap; list_for_each_entry_safe(ap, tmp_ap, coloc_ap_list, list) { list_del(&ap->list); kfree(ap); } } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_free_coloc_ap_list); static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, const u8 *pos, u8 length, const struct element *ssid_elem, u32 s_ssid_tmp) { u8 bss_params; entry->psd_20 = IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED; /* The length is already verified by the caller to contain bss_params */ if (length > sizeof(struct ieee80211_tbtt_info_7_8_9)) { struct ieee80211_tbtt_info_ge_11 *tbtt_info = (void *)pos; memcpy(entry->bssid, tbtt_info->bssid, ETH_ALEN); entry->short_ssid = le32_to_cpu(tbtt_info->short_ssid); entry->short_ssid_valid = true; bss_params = tbtt_info->bss_params; /* Ignore disabled links */ if (length >= offsetofend(typeof(*tbtt_info), mld_params)) { if (le16_get_bits(tbtt_info->mld_params.params, IEEE80211_RNR_MLD_PARAMS_DISABLED_LINK)) return -EINVAL; } if (length >= offsetofend(struct ieee80211_tbtt_info_ge_11, psd_20)) entry->psd_20 = tbtt_info->psd_20; } else { struct ieee80211_tbtt_info_7_8_9 *tbtt_info = (void *)pos; memcpy(entry->bssid, tbtt_info->bssid, ETH_ALEN); bss_params = tbtt_info->bss_params; if (length == offsetofend(struct ieee80211_tbtt_info_7_8_9, psd_20)) entry->psd_20 = tbtt_info->psd_20; } /* ignore entries with invalid BSSID */ if (!is_valid_ether_addr(entry->bssid)) return -EINVAL; /* skip non colocated APs */ if (!cfg80211_parse_bss_param(bss_params, entry)) return -EINVAL; /* no information about the short ssid. Consider the entry valid * for now. It would later be dropped in case there are explicit * SSIDs that need to be matched */ if (!entry->same_ssid && !entry->short_ssid_valid) return 0; if (entry->same_ssid) { entry->short_ssid = s_ssid_tmp; entry->short_ssid_valid = true; /* * This is safe because we validate datalen in * cfg80211_parse_colocated_ap(), before calling this * function. */ memcpy(&entry->ssid, &ssid_elem->data, ssid_elem->datalen); entry->ssid_len = ssid_elem->datalen; } return 0; } bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len, enum cfg80211_rnr_iter_ret (*iter)(void *data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len), void *iter_data) { const struct element *rnr; const u8 *pos, *end; for_each_element_id(rnr, WLAN_EID_REDUCED_NEIGHBOR_REPORT, elems, elems_len) { const struct ieee80211_neighbor_ap_info *info; pos = rnr->data; end = rnr->data + rnr->datalen; /* RNR IE may contain more than one NEIGHBOR_AP_INFO */ while (sizeof(*info) <= end - pos) { u8 length, i, count; u8 type; info = (void *)pos; count = u8_get_bits(info->tbtt_info_hdr, IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1; length = info->tbtt_info_len; pos += sizeof(*info); if (count * length > end - pos) return false; type = u8_get_bits(info->tbtt_info_hdr, IEEE80211_AP_INFO_TBTT_HDR_TYPE); for (i = 0; i < count; i++) { switch (iter(iter_data, type, info, pos, length)) { case RNR_ITER_CONTINUE: break; case RNR_ITER_BREAK: return true; case RNR_ITER_ERROR: return false; } pos += length; } } if (pos != end) return false; } return true; } EXPORT_SYMBOL_GPL(cfg80211_iter_rnr); struct colocated_ap_data { const struct element *ssid_elem; struct list_head ap_list; u32 s_ssid_tmp; int n_coloc; }; static enum cfg80211_rnr_iter_ret cfg80211_parse_colocated_ap_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct colocated_ap_data *data = _data; struct cfg80211_colocated_ap *entry; enum nl80211_band band; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (!ieee80211_operating_class_to_band(info->op_class, &band)) return RNR_ITER_CONTINUE; /* TBTT info must include bss param + BSSID + (short SSID or * same_ssid bit to be set). Ignore other options, and move to * the next AP info */ if (band != NL80211_BAND_6GHZ || !(tbtt_info_len == offsetofend(struct ieee80211_tbtt_info_7_8_9, bss_params) || tbtt_info_len == sizeof(struct ieee80211_tbtt_info_7_8_9) || tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11, bss_params))) return RNR_ITER_CONTINUE; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return RNR_ITER_ERROR; entry->center_freq = ieee80211_channel_to_frequency(info->channel, band); if (!cfg80211_parse_ap_info(entry, tbtt_info, tbtt_info_len, data->ssid_elem, data->s_ssid_tmp)) { struct cfg80211_colocated_ap *tmp; /* Don't add duplicate BSSIDs on the same channel. */ list_for_each_entry(tmp, &data->ap_list, list) { if (ether_addr_equal(tmp->bssid, entry->bssid) && tmp->center_freq == entry->center_freq) { kfree(entry); return RNR_ITER_CONTINUE; } } data->n_coloc++; list_add_tail(&entry->list, &data->ap_list); } else { kfree(entry); } return RNR_ITER_CONTINUE; } VISIBLE_IF_CFG80211_KUNIT int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, struct list_head *list) { struct colocated_ap_data data = {}; int ret; INIT_LIST_HEAD(&data.ap_list); ret = cfg80211_calc_short_ssid(ies, &data.ssid_elem, &data.s_ssid_tmp); if (ret) return 0; if (!cfg80211_iter_rnr(ies->data, ies->len, cfg80211_parse_colocated_ap_iter, &data)) { cfg80211_free_coloc_ap_list(&data.ap_list); return 0; } list_splice_tail(&data.ap_list, list); return data.n_coloc; } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_parse_colocated_ap); static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, struct ieee80211_channel *chan, bool add_to_6ghz) { int i; u32 n_channels = request->n_channels; struct cfg80211_scan_6ghz_params *params = &request->scan_6ghz_params[request->n_6ghz_params]; for (i = 0; i < n_channels; i++) { if (request->channels[i] == chan) { if (add_to_6ghz) params->channel_idx = i; return; } } request->n_channels++; request->channels[n_channels] = chan; if (add_to_6ghz) request->scan_6ghz_params[request->n_6ghz_params].channel_idx = n_channels; } static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, struct cfg80211_scan_request *request) { int i; u32 s_ssid; for (i = 0; i < request->n_ssids; i++) { /* wildcard ssid in the scan request */ if (!request->ssids[i].ssid_len) { if (ap->multi_bss && !ap->transmitted_bssid) continue; return true; } if (ap->ssid_len && ap->ssid_len == request->ssids[i].ssid_len) { if (!memcmp(request->ssids[i].ssid, ap->ssid, ap->ssid_len)) return true; } else if (ap->short_ssid_valid) { s_ssid = ~crc32_le(~0, request->ssids[i].ssid, request->ssids[i].ssid_len); if (ap->short_ssid == s_ssid) return true; } } return false; } static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) { u8 i; struct cfg80211_colocated_ap *ap; int n_channels, count = 0, err; struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; LIST_HEAD(coloc_ap_list); bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; size_t size, offs_ssids, offs_6ghz_params, offs_ies; rdev_req->scan_6ghz = true; if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) return -EOPNOTSUPP; iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ], rdev_req->wdev->iftype); if (!iftd || !iftd->he_cap.has_he) return -EOPNOTSUPP; n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels; if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { struct cfg80211_internal_bss *intbss; spin_lock_bh(&rdev->bss_lock); list_for_each_entry(intbss, &rdev->bss_list, list) { struct cfg80211_bss *res = &intbss->pub; const struct cfg80211_bss_ies *ies; const struct element *ssid_elem; struct cfg80211_colocated_ap *entry; u32 s_ssid_tmp; int ret; ies = rcu_access_pointer(res->ies); count += cfg80211_parse_colocated_ap(ies, &coloc_ap_list); /* In case the scan request specified a specific BSSID * and the BSS is found and operating on 6GHz band then * add this AP to the collocated APs list. * This is relevant for ML probe requests when the lower * band APs have not been discovered. */ if (is_broadcast_ether_addr(rdev_req->bssid) || !ether_addr_equal(rdev_req->bssid, res->bssid) || res->channel->band != NL80211_BAND_6GHZ) continue; ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); if (ret) continue; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; memcpy(entry->bssid, res->bssid, ETH_ALEN); entry->short_ssid = s_ssid_tmp; memcpy(entry->ssid, ssid_elem->data, ssid_elem->datalen); entry->ssid_len = ssid_elem->datalen; entry->short_ssid_valid = true; entry->center_freq = res->channel->center_freq; list_add_tail(&entry->list, &coloc_ap_list); count++; } spin_unlock_bh(&rdev->bss_lock); } size = struct_size(request, channels, n_channels); offs_ssids = size; size += sizeof(*request->ssids) * rdev_req->n_ssids; offs_6ghz_params = size; size += sizeof(*request->scan_6ghz_params) * count; offs_ies = size; size += rdev_req->ie_len; request = kzalloc(size, GFP_KERNEL); if (!request) { cfg80211_free_coloc_ap_list(&coloc_ap_list); return -ENOMEM; } *request = *rdev_req; request->n_channels = 0; request->n_6ghz_params = 0; if (rdev_req->n_ssids) { /* * Add the ssids from the parent scan request to the new * scan request, so the driver would be able to use them * in its probe requests to discover hidden APs on PSC * channels. */ request->ssids = (void *)request + offs_ssids; memcpy(request->ssids, rdev_req->ssids, sizeof(*request->ssids) * request->n_ssids); } request->scan_6ghz_params = (void *)request + offs_6ghz_params; if (rdev_req->ie_len) { void *ie = (void *)request + offs_ies; memcpy(ie, rdev_req->ie, rdev_req->ie_len); request->ie = ie; } /* * PSC channels should not be scanned in case of direct scan with 1 SSID * and at least one of the reported co-located APs with same SSID * indicating that all APs in the same ESS are co-located */ if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) { list_for_each_entry(ap, &coloc_ap_list, list) { if (ap->colocated_ess && cfg80211_find_ssid_match(ap, request)) { need_scan_psc = false; break; } } } /* * add to the scan request the channels that need to be scanned * regardless of the collocated APs (PSC channels or all channels * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set) */ for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ && ((need_scan_psc && cfg80211_channel_is_psc(rdev_req->channels[i])) || !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { cfg80211_scan_req_add_chan(request, rdev_req->channels[i], false); } } if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) goto skip; list_for_each_entry(ap, &coloc_ap_list, list) { bool found = false; struct cfg80211_scan_6ghz_params *scan_6ghz_params = &request->scan_6ghz_params[request->n_6ghz_params]; struct ieee80211_channel *chan = ieee80211_get_channel(&rdev->wiphy, ap->center_freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED || !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan)) continue; for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i] == chan) found = true; } if (!found) continue; if (request->n_ssids > 0 && !cfg80211_find_ssid_match(ap, request)) continue; if (!is_broadcast_ether_addr(request->bssid) && !ether_addr_equal(request->bssid, ap->bssid)) continue; if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid) continue; cfg80211_scan_req_add_chan(request, chan, true); memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); scan_6ghz_params->short_ssid = ap->short_ssid; scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid; scan_6ghz_params->unsolicited_probe = ap->unsolicited_probe; scan_6ghz_params->psd_20 = ap->psd_20; /* * If a PSC channel is added to the scan and 'need_scan_psc' is * set to false, then all the APs that the scan logic is * interested with on the channel are collocated and thus there * is no need to perform the initial PSC channel listen. */ if (cfg80211_channel_is_psc(chan) && !need_scan_psc) scan_6ghz_params->psc_no_listen = true; request->n_6ghz_params++; } skip: cfg80211_free_coloc_ap_list(&coloc_ap_list); if (request->n_channels) { struct cfg80211_scan_request *old = rdev->int_scan_req; rdev->int_scan_req = request; /* * If this scan follows a previous scan, save the scan start * info from the first part of the scan */ if (old) rdev->int_scan_req->info = old->info; err = rdev_scan(rdev, request); if (err) { rdev->int_scan_req = old; kfree(request); } else { kfree(old); } return err; } kfree(request); return -EINVAL; } int cfg80211_scan(struct cfg80211_registered_device *rdev) { struct cfg80211_scan_request *request; struct cfg80211_scan_request *rdev_req = rdev->scan_req; u32 n_channels = 0, idx, i; if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) return rdev_scan(rdev, rdev_req); for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) n_channels++; } if (!n_channels) return cfg80211_scan_6ghz(rdev); request = kzalloc(struct_size(request, channels, n_channels), GFP_KERNEL); if (!request) return -ENOMEM; *request = *rdev_req; request->n_channels = n_channels; for (i = idx = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) request->channels[idx++] = rdev_req->channels[i]; } rdev_req->scan_6ghz = false; rdev->int_scan_req = request; return rdev_scan(rdev, request); } void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { struct cfg80211_scan_request *request, *rdev_req; struct wireless_dev *wdev; struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif lockdep_assert_held(&rdev->wiphy.mtx); if (rdev->scan_msg) { nl80211_send_scan_msg(rdev, rdev->scan_msg); rdev->scan_msg = NULL; return; } rdev_req = rdev->scan_req; if (!rdev_req) return; wdev = rdev_req->wdev; request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req; if (wdev_running(wdev) && (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && !rdev_req->scan_6ghz && !request->info.aborted && !cfg80211_scan_6ghz(rdev)) return; /* * This must be before sending the other events! * Otherwise, wpa_supplicant gets completely confused with * wext events. */ if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); if (!request->info.aborted && request->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, request->scan_start); spin_unlock_bh(&rdev->bss_lock); } msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted); #ifdef CONFIG_CFG80211_WEXT if (wdev->netdev && !request->info.aborted) { memset(&wrqu, 0, sizeof(wrqu)); wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL); } #endif dev_put(wdev->netdev); kfree(rdev->int_scan_req); rdev->int_scan_req = NULL; kfree(rdev->scan_req); rdev->scan_req = NULL; if (!send_message) rdev->scan_msg = msg; else nl80211_send_scan_msg(rdev, msg); } void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk) { ___cfg80211_scan_done(wiphy_to_rdev(wiphy), true); } void cfg80211_scan_done(struct cfg80211_scan_request *request, struct cfg80211_scan_info *info) { struct cfg80211_scan_info old_info = request->info; trace_cfg80211_scan_done(request, info); WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req && request != wiphy_to_rdev(request->wiphy)->int_scan_req); request->info = *info; /* * In case the scan is split, the scan_start_tsf and tsf_bssid should * be of the first part. In such a case old_info.scan_start_tsf should * be non zero. */ if (request->scan_6ghz && old_info.scan_start_tsf) { request->info.scan_start_tsf = old_info.scan_start_tsf; memcpy(request->info.tsf_bssid, old_info.tsf_bssid, sizeof(request->info.tsf_bssid)); } request->notified = true; wiphy_work_queue(request->wiphy, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } EXPORT_SYMBOL(cfg80211_scan_done); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { lockdep_assert_held(&rdev->wiphy.mtx); list_add_rcu(&req->list, &rdev->sched_scan_req_list); } static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { lockdep_assert_held(&rdev->wiphy.mtx); list_del_rcu(&req->list); kfree_rcu(req, rcu_head); } static struct cfg80211_sched_scan_request * cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) { struct cfg80211_sched_scan_request *pos; list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list, lockdep_is_held(&rdev->wiphy.mtx)) { if (pos->reqid == reqid) return pos; } return NULL; } /* * Determines if a scheduled scan request can be handled. When a legacy * scheduled scan is running no other scheduled scan is allowed regardless * whether the request is for legacy or multi-support scan. When a multi-support * scheduled scan is running a request for legacy scan is not allowed. In this * case a request for multi-support scan can be handled if resources are * available, ie. struct wiphy::max_sched_scan_reqs limit is not yet reached. */ int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, bool want_multi) { struct cfg80211_sched_scan_request *pos; int i = 0; list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { /* request id zero means legacy in progress */ if (!i && !pos->reqid) return -EINPROGRESS; i++; } if (i) { /* no legacy allowed when multi request(s) are active */ if (!want_multi) return -EINPROGRESS; /* resource limit reached */ if (i == rdev->wiphy.max_sched_scan_reqs) return -ENOSPC; } return 0; } void cfg80211_sched_scan_results_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_res_wk); guard(wiphy)(&rdev->wiphy); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->report_results) { req->report_results = false; if (req->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, req->scan_start); spin_unlock_bh(&rdev->bss_lock); req->scan_start = jiffies; } nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_RESULTS); } } } void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_sched_scan_request *request; trace_cfg80211_sched_scan_results(wiphy, reqid); /* ignore if we're not scanning */ rcu_read_lock(); request = cfg80211_find_sched_scan_req(rdev, reqid); if (request) { request->report_results = true; queue_work(cfg80211_wq, &rdev->sched_scan_res_wk); } rcu_read_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_results); void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); lockdep_assert_held(&wiphy->mtx); trace_cfg80211_sched_scan_stopped(wiphy, reqid); __cfg80211_stop_sched_scan(rdev, reqid, true); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { guard(wiphy)(wiphy); cfg80211_sched_scan_stopped_locked(wiphy, reqid); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req, bool driver_initiated) { lockdep_assert_held(&rdev->wiphy.mtx); if (!driver_initiated) { int err = rdev_sched_scan_stop(rdev, req->dev, req->reqid); if (err) return err; } nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_STOPPED); cfg80211_del_sched_scan_req(rdev, req); return 0; } int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, u64 reqid, bool driver_initiated) { struct cfg80211_sched_scan_request *sched_scan_req; lockdep_assert_held(&rdev->wiphy.mtx); sched_scan_req = cfg80211_find_sched_scan_req(rdev, reqid); if (!sched_scan_req) return -ENOENT; return cfg80211_stop_sched_scan_req(rdev, sched_scan_req, driver_initiated); } void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs) { struct cfg80211_internal_bss *bss; unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) bss->ts -= age_jiffies; spin_unlock_bh(&rdev->bss_lock); } void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) { __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } void cfg80211_bss_flush(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, jiffies); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_bss_flush); const struct element * cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len, const u8 *match, unsigned int match_len, unsigned int match_offset) { const struct element *elem; for_each_element_id(elem, eid, ies, len) { if (elem->datalen >= match_offset + match_len && !memcmp(elem->data + match_offset, match, match_len)) return elem; } return NULL; } EXPORT_SYMBOL(cfg80211_find_elem_match); const struct element *cfg80211_find_vendor_elem(unsigned int oui, int oui_type, const u8 *ies, unsigned int len) { const struct element *elem; u8 match[] = { oui >> 16, oui >> 8, oui, oui_type }; int match_len = (oui_type < 0) ? 3 : sizeof(match); if (WARN_ON(oui_type > 0xff)) return NULL; elem = cfg80211_find_elem_match(WLAN_EID_VENDOR_SPECIFIC, ies, len, match, match_len, 0); if (!elem || elem->datalen < 4) return NULL; return elem; } EXPORT_SYMBOL(cfg80211_find_vendor_elem); /** * enum bss_compare_mode - BSS compare mode * @BSS_CMP_REGULAR: regular compare mode (for insertion and normal find) * @BSS_CMP_HIDE_ZLEN: find hidden SSID with zero-length mode * @BSS_CMP_HIDE_NUL: find hidden SSID with NUL-ed out mode */ enum bss_compare_mode { BSS_CMP_REGULAR, BSS_CMP_HIDE_ZLEN, BSS_CMP_HIDE_NUL, }; static int cmp_bss(struct cfg80211_bss *a, struct cfg80211_bss *b, enum bss_compare_mode mode) { const struct cfg80211_bss_ies *a_ies, *b_ies; const u8 *ie1 = NULL; const u8 *ie2 = NULL; int i, r; if (a->channel != b->channel) return (b->channel->center_freq * 1000 + b->channel->freq_offset) - (a->channel->center_freq * 1000 + a->channel->freq_offset); a_ies = rcu_access_pointer(a->ies); if (!a_ies) return -1; b_ies = rcu_access_pointer(b->ies); if (!b_ies) return 1; if (WLAN_CAPABILITY_IS_STA_BSS(a->capability)) ie1 = cfg80211_find_ie(WLAN_EID_MESH_ID, a_ies->data, a_ies->len); if (WLAN_CAPABILITY_IS_STA_BSS(b->capability)) ie2 = cfg80211_find_ie(WLAN_EID_MESH_ID, b_ies->data, b_ies->len); if (ie1 && ie2) { int mesh_id_cmp; if (ie1[1] == ie2[1]) mesh_id_cmp = memcmp(ie1 + 2, ie2 + 2, ie1[1]); else mesh_id_cmp = ie2[1] - ie1[1]; ie1 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, a_ies->data, a_ies->len); ie2 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, b_ies->data, b_ies->len); if (ie1 && ie2) { if (mesh_id_cmp) return mesh_id_cmp; if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; return memcmp(ie1 + 2, ie2 + 2, ie1[1]); } } r = memcmp(a->bssid, b->bssid, sizeof(a->bssid)); if (r) return r; ie1 = cfg80211_find_ie(WLAN_EID_SSID, a_ies->data, a_ies->len); ie2 = cfg80211_find_ie(WLAN_EID_SSID, b_ies->data, b_ies->len); if (!ie1 && !ie2) return 0; /* * Note that with "hide_ssid", the function returns a match if * the already-present BSS ("b") is a hidden SSID beacon for * the new BSS ("a"). */ /* sort missing IE before (left of) present IE */ if (!ie1) return -1; if (!ie2) return 1; switch (mode) { case BSS_CMP_HIDE_ZLEN: /* * In ZLEN mode we assume the BSS entry we're * looking for has a zero-length SSID. So if * the one we're looking at right now has that, * return 0. Otherwise, return the difference * in length, but since we're looking for the * 0-length it's really equivalent to returning * the length of the one we're looking at. * * No content comparison is needed as we assume * the content length is zero. */ return ie2[1]; case BSS_CMP_REGULAR: default: /* sort by length first, then by contents */ if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; return memcmp(ie1 + 2, ie2 + 2, ie1[1]); case BSS_CMP_HIDE_NUL: if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; /* this is equivalent to memcmp(zeroes, ie2 + 2, len) */ for (i = 0; i < ie2[1]; i++) if (ie2[i + 2]) return -1; return 0; } } static bool cfg80211_bss_type_match(u16 capability, enum nl80211_band band, enum ieee80211_bss_type bss_type) { bool ret = true; u16 mask, val; if (bss_type == IEEE80211_BSS_TYPE_ANY) return ret; if (band == NL80211_BAND_60GHZ) { mask = WLAN_CAPABILITY_DMG_TYPE_MASK; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: val = WLAN_CAPABILITY_DMG_TYPE_AP; break; case IEEE80211_BSS_TYPE_PBSS: val = WLAN_CAPABILITY_DMG_TYPE_PBSS; break; case IEEE80211_BSS_TYPE_IBSS: val = WLAN_CAPABILITY_DMG_TYPE_IBSS; break; default: return false; } } else { mask = WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: val = WLAN_CAPABILITY_ESS; break; case IEEE80211_BSS_TYPE_IBSS: val = WLAN_CAPABILITY_IBSS; break; case IEEE80211_BSS_TYPE_MBSS: val = 0; break; default: return false; } } ret = ((capability & mask) == val); return ret; } /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, const u8 *ssid, size_t ssid_len, enum ieee80211_bss_type bss_type, enum ieee80211_privacy privacy, u32 use_for) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *res = NULL; unsigned long now = jiffies; int bss_privacy; trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (!cfg80211_bss_type_match(bss->pub.capability, bss->pub.channel->band, bss_type)) continue; bss_privacy = (bss->pub.capability & WLAN_CAPABILITY_PRIVACY); if ((privacy == IEEE80211_PRIVACY_ON && !bss_privacy) || (privacy == IEEE80211_PRIVACY_OFF && bss_privacy)) continue; if (channel && bss->pub.channel != channel) continue; if (!is_valid_ether_addr(bss->pub.bssid)) continue; if ((bss->pub.use_for & use_for) != use_for) continue; /* Don't get expired BSS structs */ if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) && !atomic_read(&bss->hold)) continue; if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { res = bss; bss_ref_get(rdev, res); break; } } spin_unlock_bh(&rdev->bss_lock); if (!res) return NULL; trace_cfg80211_return_bss(&res->pub); return &res->pub; } EXPORT_SYMBOL(__cfg80211_get_bss); static bool rb_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { struct rb_node **p = &rdev->bss_tree.rb_node; struct rb_node *parent = NULL; struct cfg80211_internal_bss *tbss; int cmp; while (*p) { parent = *p; tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn); cmp = cmp_bss(&bss->pub, &tbss->pub, BSS_CMP_REGULAR); if (WARN_ON(!cmp)) { /* will sort of leak this BSS */ return false; } if (cmp < 0) p = &(*p)->rb_left; else p = &(*p)->rb_right; } rb_link_node(&bss->rbn, parent, p); rb_insert_color(&bss->rbn, &rdev->bss_tree); return true; } static struct cfg80211_internal_bss * rb_find_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *res, enum bss_compare_mode mode) { struct rb_node *n = rdev->bss_tree.rb_node; struct cfg80211_internal_bss *bss; int r; while (n) { bss = rb_entry(n, struct cfg80211_internal_bss, rbn); r = cmp_bss(&res->pub, &bss->pub, mode); if (r == 0) return bss; else if (r < 0) n = n->rb_left; else n = n->rb_right; } return NULL; } static void cfg80211_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (!rb_insert_bss(rdev, bss)) return; list_add_tail(&bss->list, &rdev->bss_list); rdev->bss_entries++; } static void cfg80211_rehash_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); rb_erase(&bss->rbn, &rdev->bss_tree); if (!rb_insert_bss(rdev, bss)) { list_del(&bss->list); if (!list_empty(&bss->hidden_list)) list_del_init(&bss->hidden_list); if (!list_empty(&bss->pub.nontrans_list)) list_del_init(&bss->pub.nontrans_list); rdev->bss_entries--; } rdev->bss_generation++; } static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *new) { const struct cfg80211_bss_ies *ies; struct cfg80211_internal_bss *bss; const u8 *ie; int i, ssidlen; u8 fold = 0; u32 n_entries = 0; ies = rcu_access_pointer(new->pub.beacon_ies); if (WARN_ON(!ies)) return false; ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); if (!ie) { /* nothing to do */ return true; } ssidlen = ie[1]; for (i = 0; i < ssidlen; i++) fold |= ie[2 + i]; if (fold) { /* not a hidden SSID */ return true; } /* This is the bad part ... */ list_for_each_entry(bss, &rdev->bss_list, list) { /* * we're iterating all the entries anyway, so take the * opportunity to validate the list length accounting */ n_entries++; if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid)) continue; if (bss->pub.channel != new->pub.channel) continue; if (rcu_access_pointer(bss->pub.beacon_ies)) continue; ies = rcu_access_pointer(bss->pub.ies); if (!ies) continue; ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); if (!ie) continue; if (ssidlen && ie[1] != ssidlen) continue; if (WARN_ON_ONCE(bss->pub.hidden_beacon_bss)) continue; if (WARN_ON_ONCE(!list_empty(&bss->hidden_list))) list_del(&bss->hidden_list); /* combine them */ list_add(&bss->hidden_list, &new->hidden_list); bss->pub.hidden_beacon_bss = &new->pub; new->refcount += bss->refcount; rcu_assign_pointer(bss->pub.beacon_ies, new->pub.beacon_ies); } WARN_ONCE(n_entries != rdev->bss_entries, "rdev bss entries[%d]/list[len:%d] corruption\n", rdev->bss_entries, n_entries); return true; } static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, const struct cfg80211_bss_ies *new_ies, const struct cfg80211_bss_ies *old_ies) { struct cfg80211_internal_bss *bss; /* Assign beacon IEs to all sub entries */ list_for_each_entry(bss, &known->hidden_list, hidden_list) { const struct cfg80211_bss_ies *ies; ies = rcu_access_pointer(bss->pub.beacon_ies); WARN_ON(ies != old_ies); rcu_assign_pointer(bss->pub.beacon_ies, new_ies); } } static void cfg80211_check_stuck_ecsa(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, const struct cfg80211_bss_ies *old) { const struct ieee80211_ext_chansw_ie *ecsa; const struct element *elem_new, *elem_old; const struct cfg80211_bss_ies *new, *bcn; if (known->pub.proberesp_ecsa_stuck) return; new = rcu_dereference_protected(known->pub.proberesp_ies, lockdep_is_held(&rdev->bss_lock)); if (WARN_ON(!new)) return; if (new->tsf - old->tsf < USEC_PER_SEC) return; elem_old = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, old->data, old->len); if (!elem_old) return; elem_new = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, new->data, new->len); if (!elem_new) return; bcn = rcu_dereference_protected(known->pub.beacon_ies, lockdep_is_held(&rdev->bss_lock)); if (bcn && cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, bcn->data, bcn->len)) return; if (elem_new->datalen != elem_old->datalen) return; if (elem_new->datalen < sizeof(struct ieee80211_ext_chansw_ie)) return; if (memcmp(elem_new->data, elem_old->data, elem_new->datalen)) return; ecsa = (void *)elem_new->data; if (!ecsa->mode) return; if (ecsa->new_ch_num != ieee80211_frequency_to_channel(known->pub.channel->center_freq)) return; known->pub.proberesp_ecsa_stuck = 1; } static bool cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, struct cfg80211_internal_bss *new, bool signal_valid) { lockdep_assert_held(&rdev->bss_lock); /* Update IEs */ if (rcu_access_pointer(new->pub.proberesp_ies)) { const struct cfg80211_bss_ies *old; old = rcu_access_pointer(known->pub.proberesp_ies); rcu_assign_pointer(known->pub.proberesp_ies, new->pub.proberesp_ies); /* Override possible earlier Beacon frame IEs */ rcu_assign_pointer(known->pub.ies, new->pub.proberesp_ies); if (old) { cfg80211_check_stuck_ecsa(rdev, known, old); kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } } if (rcu_access_pointer(new->pub.beacon_ies)) { const struct cfg80211_bss_ies *old; if (known->pub.hidden_beacon_bss && !list_empty(&known->hidden_list)) { const struct cfg80211_bss_ies *f; /* The known BSS struct is one of the probe * response members of a group, but we're * receiving a beacon (beacon_ies in the new * bss is used). This can only mean that the * AP changed its beacon from not having an * SSID to showing it, which is confusing so * drop this information. */ f = rcu_access_pointer(new->pub.beacon_ies); kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); return false; } old = rcu_access_pointer(known->pub.beacon_ies); rcu_assign_pointer(known->pub.beacon_ies, new->pub.beacon_ies); /* Override IEs if they were from a beacon before */ if (old == rcu_access_pointer(known->pub.ies)) rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); cfg80211_update_hidden_bsses(known, rcu_access_pointer(new->pub.beacon_ies), old); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } known->pub.beacon_interval = new->pub.beacon_interval; /* don't update the signal if beacon was heard on * adjacent channel. */ if (signal_valid) known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; known->ts = new->ts; known->ts_boottime = new->ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(known->parent_bssid, new->parent_bssid); known->pub.max_bssid_indicator = new->pub.max_bssid_indicator; known->pub.bssid_index = new->pub.bssid_index; known->pub.use_for &= new->pub.use_for; known->pub.cannot_use_reasons = new->pub.cannot_use_reasons; known->bss_source = new->bss_source; return true; } /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_internal_bss * __cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *found = NULL; struct cfg80211_bss_ies *ies; if (WARN_ON(!tmp->pub.channel)) goto free_ies; tmp->ts = ts; if (WARN_ON(!rcu_access_pointer(tmp->pub.ies))) goto free_ies; found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR); if (found) { if (!cfg80211_update_known_bss(rdev, found, tmp, signal_valid)) return NULL; } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; /* * create a copy -- the "res" variable that is passed in * is allocated on the stack since it's not needed in the * more common case of an update */ new = kzalloc(sizeof(*new) + rdev->wiphy.bss_priv_size, GFP_ATOMIC); if (!new) goto free_ies; memcpy(new, tmp, sizeof(*new)); new->refcount = 1; INIT_LIST_HEAD(&new->hidden_list); INIT_LIST_HEAD(&new->pub.nontrans_list); /* we'll set this later if it was non-NULL */ new->pub.transmitted_bss = NULL; if (rcu_access_pointer(tmp->pub.proberesp_ies)) { hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN); if (!hidden) hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_NUL); if (hidden) { new->pub.hidden_beacon_bss = &hidden->pub; list_add(&new->hidden_list, &hidden->hidden_list); hidden->refcount++; ies = (void *)rcu_access_pointer(new->pub.beacon_ies); rcu_assign_pointer(new->pub.beacon_ies, hidden->pub.beacon_ies); if (ies) kfree_rcu(ies, rcu_head); } } else { /* * Ok so we found a beacon, and don't have an entry. If * it's a beacon with hidden SSID, we might be in for an * expensive search for any probe responses that should * be grouped with this beacon for updates ... */ if (!cfg80211_combine_bsses(rdev, new)) { bss_ref_put(rdev, new); return NULL; } } if (rdev->bss_entries >= bss_entries_limit && !cfg80211_bss_expire_oldest(rdev)) { bss_ref_put(rdev, new); return NULL; } /* This must be before the call to bss_ref_get */ if (tmp->pub.transmitted_bss) { new->pub.transmitted_bss = tmp->pub.transmitted_bss; bss_ref_get(rdev, bss_from_pub(tmp->pub.transmitted_bss)); } cfg80211_insert_bss(rdev, new); found = new; } rdev->bss_generation++; bss_ref_get(rdev, found); return found; free_ies: ies = (void *)rcu_access_pointer(tmp->pub.beacon_ies); if (ies) kfree_rcu(ies, rcu_head); ies = (void *)rcu_access_pointer(tmp->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); return NULL; } struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *res; spin_lock_bh(&rdev->bss_lock); res = __cfg80211_bss_update(rdev, tmp, signal_valid, ts); spin_unlock_bh(&rdev->bss_lock); return res; } int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, enum nl80211_band band) { const struct element *tmp; if (band == NL80211_BAND_6GHZ) { struct ieee80211_he_operation *he_oper; tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen); if (tmp && tmp->datalen >= sizeof(*he_oper) && tmp->datalen >= ieee80211_he_oper_size(&tmp->data[1])) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; he_oper = (void *)&tmp->data[1]; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return -1; return he_6ghz_oper->primary; } } else if (band == NL80211_BAND_S1GHZ) { tmp = cfg80211_find_elem(WLAN_EID_S1G_OPERATION, ie, ielen); if (tmp && tmp->datalen >= sizeof(struct ieee80211_s1g_oper_ie)) { struct ieee80211_s1g_oper_ie *s1gop = (void *)tmp->data; return s1gop->oper_ch; } } else { tmp = cfg80211_find_elem(WLAN_EID_DS_PARAMS, ie, ielen); if (tmp && tmp->datalen == 1) return tmp->data[0]; tmp = cfg80211_find_elem(WLAN_EID_HT_OPERATION, ie, ielen); if (tmp && tmp->datalen >= sizeof(struct ieee80211_ht_operation)) { struct ieee80211_ht_operation *htop = (void *)tmp->data; return htop->primary_chan; } } return -1; } EXPORT_SYMBOL(cfg80211_get_ies_channel_number); /* * Update RX channel information based on the available frame payload * information. This is mainly for the 2.4 GHz band where frames can be received * from neighboring channels and the Beacon frames use the DSSS Parameter Set * element to indicate the current (transmitting) channel, but this might also * be needed on other bands if RX frequency does not match with the actual * operating channel of a BSS, or if the AP reports a different primary channel. */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, struct ieee80211_channel *channel) { u32 freq; int channel_number; struct ieee80211_channel *alt_channel; channel_number = cfg80211_get_ies_channel_number(ie, ielen, channel->band); if (channel_number < 0) { /* No channel information in frame payload */ return channel; } freq = ieee80211_channel_to_freq_khz(channel_number, channel->band); /* * Frame info (beacon/prob res) is the same as received channel, * no need for further processing. */ if (freq == ieee80211_channel_to_khz(channel)) return channel; alt_channel = ieee80211_get_channel_khz(wiphy, freq); if (!alt_channel) { if (channel->band == NL80211_BAND_2GHZ || channel->band == NL80211_BAND_6GHZ) { /* * Better not allow unexpected channels when that could * be going beyond the 1-11 range (e.g., discovering * BSS on channel 12 when radio is configured for * channel 11) or beyond the 6 GHz channel range. */ return NULL; } /* No match for the payload channel number - ignore it */ return channel; } /* * Use the channel determined through the payload channel number * instead of the RX channel reported by the driver. */ if (alt_channel->flags & IEEE80211_CHAN_DISABLED) return NULL; return alt_channel; } struct cfg80211_inform_single_bss_data { struct cfg80211_inform_bss *drv_data; enum cfg80211_bss_frame_type ftype; struct ieee80211_channel *channel; u8 bssid[ETH_ALEN]; u64 tsf; u16 capability; u16 beacon_interval; const u8 *ie; size_t ielen; enum bss_source_type bss_source; /* Set if reporting bss_source != BSS_SOURCE_DIRECT */ struct cfg80211_bss *source_bss; u8 max_bssid_indicator; u8 bssid_index; u8 use_for; u64 cannot_use_reasons; }; enum ieee80211_ap_reg_power cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; struct ieee80211_he_operation *he_oper; const struct element *tmp; tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, elems, elems_len); if (!tmp || tmp->datalen < sizeof(*he_oper) + 1 || tmp->datalen < ieee80211_he_oper_size(tmp->data + 1)) return IEEE80211_REG_UNSET_AP; he_oper = (void *)&tmp->data[1]; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return IEEE80211_REG_UNSET_AP; switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; default: return IEEE80211_REG_UNSET_AP; } } static bool cfg80211_6ghz_power_type_valid(const u8 *elems, size_t elems_len, const u32 flags) { switch (cfg80211_get_6ghz_power_type(elems, elems_len)) { case IEEE80211_REG_LPI_AP: return true; case IEEE80211_REG_SP_AP: return !(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT); case IEEE80211_REG_VLP_AP: return !(flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT); default: return false; } } /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss * cfg80211_inform_single_bss_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *data, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_inform_bss *drv_data = data->drv_data; struct cfg80211_bss_ies *ies; struct ieee80211_channel *channel; struct cfg80211_internal_bss tmp = {}, *res; int bss_type; bool signal_valid; unsigned long ts; if (WARN_ON(!wiphy)) return NULL; if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && (drv_data->signal < 0 || drv_data->signal > 100))) return NULL; if (WARN_ON(data->bss_source != BSS_SOURCE_DIRECT && !data->source_bss)) return NULL; channel = data->channel; if (!channel) channel = cfg80211_get_bss_channel(wiphy, data->ie, data->ielen, drv_data->chan); if (!channel) return NULL; if (channel->band == NL80211_BAND_6GHZ && !cfg80211_6ghz_power_type_valid(data->ie, data->ielen, channel->flags)) { data->use_for = 0; data->cannot_use_reasons = NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH; } memcpy(tmp.pub.bssid, data->bssid, ETH_ALEN); tmp.pub.channel = channel; if (data->bss_source != BSS_SOURCE_STA_PROFILE) tmp.pub.signal = drv_data->signal; else tmp.pub.signal = 0; tmp.pub.beacon_interval = data->beacon_interval; tmp.pub.capability = data->capability; tmp.ts_boottime = drv_data->boottime_ns; tmp.parent_tsf = drv_data->parent_tsf; ether_addr_copy(tmp.parent_bssid, drv_data->parent_bssid); tmp.pub.chains = drv_data->chains; memcpy(tmp.pub.chain_signal, drv_data->chain_signal, IEEE80211_MAX_CHAINS); tmp.pub.use_for = data->use_for; tmp.pub.cannot_use_reasons = data->cannot_use_reasons; tmp.bss_source = data->bss_source; switch (data->bss_source) { case BSS_SOURCE_MBSSID: tmp.pub.transmitted_bss = data->source_bss; fallthrough; case BSS_SOURCE_STA_PROFILE: ts = bss_from_pub(data->source_bss)->ts; tmp.pub.bssid_index = data->bssid_index; tmp.pub.max_bssid_indicator = data->max_bssid_indicator; break; case BSS_SOURCE_DIRECT: ts = jiffies; if (channel->band == NL80211_BAND_60GHZ) { bss_type = data->capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) regulatory_hint_found_beacon(wiphy, channel, gfp); } else { if (data->capability & WLAN_CAPABILITY_ESS) regulatory_hint_found_beacon(wiphy, channel, gfp); } break; } /* * If we do not know here whether the IEs are from a Beacon or Probe * Response frame, we need to pick one of the options and only use it * with the driver that does not provide the full Beacon/Probe Response * frame. Use Beacon frame pointer to avoid indicating that this should * override the IEs pointer should we have received an earlier * indication of Probe Response data. */ ies = kzalloc(sizeof(*ies) + data->ielen, gfp); if (!ies) return NULL; ies->len = data->ielen; ies->tsf = data->tsf; ies->from_beacon = false; memcpy(ies->data, data->ie, data->ielen); switch (data->ftype) { case CFG80211_BSS_FTYPE_BEACON: case CFG80211_BSS_FTYPE_S1G_BEACON: ies->from_beacon = true; fallthrough; case CFG80211_BSS_FTYPE_UNKNOWN: rcu_assign_pointer(tmp.pub.beacon_ies, ies); break; case CFG80211_BSS_FTYPE_PRESP: rcu_assign_pointer(tmp.pub.proberesp_ies, ies); break; } rcu_assign_pointer(tmp.pub.ies, ies); signal_valid = drv_data->chan == channel; spin_lock_bh(&rdev->bss_lock); res = __cfg80211_bss_update(rdev, &tmp, signal_valid, ts); if (!res) goto drop; rdev_inform_bss(rdev, &res->pub, ies, drv_data->drv_data); if (data->bss_source == BSS_SOURCE_MBSSID) { /* this is a nontransmitting bss, we need to add it to * transmitting bss' list if it is not there */ if (cfg80211_add_nontrans_list(data->source_bss, &res->pub)) { if (__cfg80211_unlink_bss(rdev, res)) { rdev->bss_generation++; res = NULL; } } if (!res) goto drop; } spin_unlock_bh(&rdev->bss_lock); trace_cfg80211_return_bss(&res->pub); /* __cfg80211_bss_update gives us a referenced result */ return &res->pub; drop: spin_unlock_bh(&rdev->bss_lock); return NULL; } static const struct element *cfg80211_get_profile_continuation(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem) { const u8 *mbssid_end = mbssid_elem->data + mbssid_elem->datalen; const struct element *next_mbssid; const struct element *next_sub; next_mbssid = cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, mbssid_end, ielen - (mbssid_end - ie)); /* * If it is not the last subelement in current MBSSID IE or there isn't * a next MBSSID IE - profile is complete. */ if ((sub_elem->data + sub_elem->datalen < mbssid_end - 1) || !next_mbssid) return NULL; /* For any length error, just return NULL */ if (next_mbssid->datalen < 4) return NULL; next_sub = (void *)&next_mbssid->data[1]; if (next_mbssid->data + next_mbssid->datalen < next_sub->data + next_sub->datalen) return NULL; if (next_sub->id != 0 || next_sub->datalen < 2) return NULL; /* * Check if the first element in the next sub element is a start * of a new profile */ return next_sub->data[0] == WLAN_EID_NON_TX_BSSID_CAP ? NULL : next_mbssid; } size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem, u8 *merged_ie, size_t max_copy_len) { size_t copied_len = sub_elem->datalen; const struct element *next_mbssid; if (sub_elem->datalen > max_copy_len) return 0; memcpy(merged_ie, sub_elem->data, sub_elem->datalen); while ((next_mbssid = cfg80211_get_profile_continuation(ie, ielen, mbssid_elem, sub_elem))) { const struct element *next_sub = (void *)&next_mbssid->data[1]; if (copied_len + next_sub->datalen > max_copy_len) break; memcpy(merged_ie + copied_len, next_sub->data, next_sub->datalen); copied_len += next_sub->datalen; } return copied_len; } EXPORT_SYMBOL(cfg80211_merge_profile); static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *tx_data, struct cfg80211_bss *source_bss, gfp_t gfp) { struct cfg80211_inform_single_bss_data data = { .drv_data = tx_data->drv_data, .ftype = tx_data->ftype, .tsf = tx_data->tsf, .beacon_interval = tx_data->beacon_interval, .source_bss = source_bss, .bss_source = BSS_SOURCE_MBSSID, .use_for = tx_data->use_for, .cannot_use_reasons = tx_data->cannot_use_reasons, }; const u8 *mbssid_index_ie; const struct element *elem, *sub; u8 *new_ie, *profile; u64 seen_indices = 0; struct cfg80211_bss *bss; if (!source_bss) return; if (!cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, tx_data->ie, tx_data->ielen)) return; if (!wiphy->support_mbssid) return; if (wiphy->support_only_he_mbssid && !cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, tx_data->ie, tx_data->ielen)) return; new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp); if (!new_ie) return; profile = kmalloc(tx_data->ielen, gfp); if (!profile) goto out; for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, tx_data->ie, tx_data->ielen) { if (elem->datalen < 4) continue; if (elem->data[0] < 1 || (int)elem->data[0] > 8) continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 profile_len; if (sub->id != 0 || sub->datalen < 4) { /* not a valid BSS profile */ continue; } if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || sub->data[1] != 2) { /* The first element within the Nontransmitted * BSSID Profile is not the Nontransmitted * BSSID Capability element. */ continue; } memset(profile, 0, tx_data->ielen); profile_len = cfg80211_merge_profile(tx_data->ie, tx_data->ielen, elem, sub, profile, tx_data->ielen); /* found a Nontransmitted BSSID Profile */ mbssid_index_ie = cfg80211_find_ie (WLAN_EID_MULTI_BSSID_IDX, profile, profile_len); if (!mbssid_index_ie || mbssid_index_ie[1] < 1 || mbssid_index_ie[2] == 0 || mbssid_index_ie[2] > 46 || mbssid_index_ie[2] >= (1 << elem->data[0])) { /* No valid Multiple BSSID-Index element */ continue; } if (seen_indices & BIT_ULL(mbssid_index_ie[2])) /* We don't support legacy split of a profile */ net_dbg_ratelimited("Partial info for BSSID index %d\n", mbssid_index_ie[2]); seen_indices |= BIT_ULL(mbssid_index_ie[2]); data.bssid_index = mbssid_index_ie[2]; data.max_bssid_indicator = elem->data[0]; cfg80211_gen_new_bssid(tx_data->bssid, data.max_bssid_indicator, data.bssid_index, data.bssid); memset(new_ie, 0, IEEE80211_MAX_DATA_LEN); data.ie = new_ie; data.ielen = cfg80211_gen_new_ie(tx_data->ie, tx_data->ielen, profile, profile_len, new_ie, IEEE80211_MAX_DATA_LEN); if (!data.ielen) continue; data.capability = get_unaligned_le16(profile + 2); bss = cfg80211_inform_single_bss_data(wiphy, &data, gfp); if (!bss) break; cfg80211_put_bss(wiphy, bss); } } out: kfree(new_ie); kfree(profile); } ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies, size_t ieslen, u8 *data, size_t data_len, u8 frag_id) { const struct element *next; ssize_t copied; u8 elem_datalen; if (!elem) return -EINVAL; /* elem might be invalid after the memmove */ next = (void *)(elem->data + elem->datalen); elem_datalen = elem->datalen; if (elem->id == WLAN_EID_EXTENSION) { copied = elem->datalen - 1; if (data) { if (copied > data_len) return -ENOSPC; memmove(data, elem->data + 1, copied); } } else { copied = elem->datalen; if (data) { if (copied > data_len) return -ENOSPC; memmove(data, elem->data, copied); } } /* Fragmented elements must have 255 bytes */ if (elem_datalen < 255) return copied; for (elem = next; elem->data < ies + ieslen && elem->data + elem->datalen <= ies + ieslen; elem = next) { /* elem might be invalid after the memmove */ next = (void *)(elem->data + elem->datalen); if (elem->id != frag_id) break; elem_datalen = elem->datalen; if (data) { if (copied + elem_datalen > data_len) return -ENOSPC; memmove(data + copied, elem->data, elem_datalen); } copied += elem_datalen; /* Only the last fragment may be short */ if (elem_datalen != 255) break; } return copied; } EXPORT_SYMBOL(cfg80211_defragment_element); struct cfg80211_mle { struct ieee80211_multi_link_elem *mle; struct ieee80211_mle_per_sta_profile *sta_prof[IEEE80211_MLD_MAX_NUM_LINKS]; ssize_t sta_prof_len[IEEE80211_MLD_MAX_NUM_LINKS]; u8 data[]; }; static struct cfg80211_mle * cfg80211_defrag_mle(const struct element *mle, const u8 *ie, size_t ielen, gfp_t gfp) { const struct element *elem; struct cfg80211_mle *res; size_t buf_len; ssize_t mle_len; u8 common_size, idx; if (!mle || !ieee80211_mle_size_ok(mle->data + 1, mle->datalen - 1)) return NULL; /* Required length for first defragmentation */ buf_len = mle->datalen - 1; for_each_element(elem, mle->data + mle->datalen, ielen - sizeof(*mle) + mle->datalen) { if (elem->id != WLAN_EID_FRAGMENT) break; buf_len += elem->datalen; } res = kzalloc(struct_size(res, data, buf_len), gfp); if (!res) return NULL; mle_len = cfg80211_defragment_element(mle, ie, ielen, res->data, buf_len, WLAN_EID_FRAGMENT); if (mle_len < 0) goto error; res->mle = (void *)res->data; /* Find the sub-element area in the buffer */ common_size = ieee80211_mle_common_size((u8 *)res->mle); ie = res->data + common_size; ielen = mle_len - common_size; idx = 0; for_each_element_id(elem, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE, ie, ielen) { res->sta_prof[idx] = (void *)elem->data; res->sta_prof_len[idx] = elem->datalen; idx++; if (idx >= IEEE80211_MLD_MAX_NUM_LINKS) break; } if (!for_each_element_completed(elem, ie, ielen)) goto error; /* Defragment sta_info in-place */ for (idx = 0; idx < IEEE80211_MLD_MAX_NUM_LINKS && res->sta_prof[idx]; idx++) { if (res->sta_prof_len[idx] < 255) continue; elem = (void *)res->sta_prof[idx] - 2; if (idx + 1 < ARRAY_SIZE(res->sta_prof) && res->sta_prof[idx + 1]) buf_len = (u8 *)res->sta_prof[idx + 1] - (u8 *)res->sta_prof[idx]; else buf_len = ielen + ie - (u8 *)elem; res->sta_prof_len[idx] = cfg80211_defragment_element(elem, (u8 *)elem, buf_len, (u8 *)res->sta_prof[idx], buf_len, IEEE80211_MLE_SUBELEM_FRAGMENT); if (res->sta_prof_len[idx] < 0) goto error; } return res; error: kfree(res); return NULL; } struct tbtt_info_iter_data { const struct ieee80211_neighbor_ap_info *ap_info; u8 param_ch_count; u32 use_for; u8 mld_id, link_id; bool non_tx; }; static enum cfg80211_rnr_iter_ret cfg802121_mld_ap_rnr_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { const struct ieee80211_rnr_mld_params *mld_params; struct tbtt_info_iter_data *data = _data; u8 link_id; bool non_tx = false; if (type == IEEE80211_TBTT_INFO_TYPE_TBTT && tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11, mld_params)) { const struct ieee80211_tbtt_info_ge_11 *tbtt_info_ge_11 = (void *)tbtt_info; non_tx = (tbtt_info_ge_11->bss_params & (IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID | IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID)) == IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID; mld_params = &tbtt_info_ge_11->mld_params; } else if (type == IEEE80211_TBTT_INFO_TYPE_MLD && tbtt_info_len >= sizeof(struct ieee80211_rnr_mld_params)) mld_params = (void *)tbtt_info; else return RNR_ITER_CONTINUE; link_id = le16_get_bits(mld_params->params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); if (data->mld_id != mld_params->mld_id) return RNR_ITER_CONTINUE; if (data->link_id != link_id) return RNR_ITER_CONTINUE; data->ap_info = info; data->param_ch_count = le16_get_bits(mld_params->params, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); data->non_tx = non_tx; if (type == IEEE80211_TBTT_INFO_TYPE_TBTT) data->use_for = NL80211_BSS_USE_FOR_ALL; else data->use_for = NL80211_BSS_USE_FOR_MLD_LINK; return RNR_ITER_BREAK; } static u8 cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, const struct ieee80211_neighbor_ap_info **ap_info, u8 *param_ch_count, bool *non_tx) { struct tbtt_info_iter_data data = { .mld_id = mld_id, .link_id = link_id, }; cfg80211_iter_rnr(ie, ielen, cfg802121_mld_ap_rnr_iter, &data); *ap_info = data.ap_info; *param_ch_count = data.param_ch_count; *non_tx = data.non_tx; return data.use_for; } static struct element * cfg80211_gen_reporter_rnr(struct cfg80211_bss *source_bss, bool is_mbssid, bool same_mld, u8 link_id, u8 bss_change_count, gfp_t gfp) { const struct cfg80211_bss_ies *ies; struct ieee80211_neighbor_ap_info ap_info; struct ieee80211_tbtt_info_ge_11 tbtt_info; u32 short_ssid; const struct element *elem; struct element *res; /* * We only generate the RNR to permit ML lookups. For that we do not * need an entry for the corresponding transmitting BSS, lets just skip * it even though it would be easy to add. */ if (!same_mld) return NULL; /* We could use tx_data->ies if we change cfg80211_calc_short_ssid */ rcu_read_lock(); ies = rcu_dereference(source_bss->ies); ap_info.tbtt_info_len = offsetofend(typeof(tbtt_info), mld_params); ap_info.tbtt_info_hdr = u8_encode_bits(IEEE80211_TBTT_INFO_TYPE_TBTT, IEEE80211_AP_INFO_TBTT_HDR_TYPE) | u8_encode_bits(0, IEEE80211_AP_INFO_TBTT_HDR_COUNT); ap_info.channel = ieee80211_frequency_to_channel(source_bss->channel->center_freq); /* operating class */ elem = cfg80211_find_elem(WLAN_EID_SUPPORTED_REGULATORY_CLASSES, ies->data, ies->len); if (elem && elem->datalen >= 1) { ap_info.op_class = elem->data[0]; } else { struct cfg80211_chan_def chandef; /* The AP is not providing us with anything to work with. So * make up a somewhat reasonable operating class, but don't * bother with it too much as no one will ever use the * information. */ cfg80211_chandef_create(&chandef, source_bss->channel, NL80211_CHAN_NO_HT); if (!ieee80211_chandef_to_operating_class(&chandef, &ap_info.op_class)) goto out_unlock; } /* Just set TBTT offset and PSD 20 to invalid/unknown */ tbtt_info.tbtt_offset = 255; tbtt_info.psd_20 = IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED; memcpy(tbtt_info.bssid, source_bss->bssid, ETH_ALEN); if (cfg80211_calc_short_ssid(ies, &elem, &short_ssid)) goto out_unlock; rcu_read_unlock(); tbtt_info.short_ssid = cpu_to_le32(short_ssid); tbtt_info.bss_params = IEEE80211_RNR_TBTT_PARAMS_SAME_SSID; if (is_mbssid) { tbtt_info.bss_params |= IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID; tbtt_info.bss_params |= IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID; } tbtt_info.mld_params.mld_id = 0; tbtt_info.mld_params.params = le16_encode_bits(link_id, IEEE80211_RNR_MLD_PARAMS_LINK_ID) | le16_encode_bits(bss_change_count, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); res = kzalloc(struct_size(res, data, sizeof(ap_info) + ap_info.tbtt_info_len), gfp); if (!res) return NULL; /* Copy the data */ res->id = WLAN_EID_REDUCED_NEIGHBOR_REPORT; res->datalen = sizeof(ap_info) + ap_info.tbtt_info_len; memcpy(res->data, &ap_info, sizeof(ap_info)); memcpy(res->data + sizeof(ap_info), &tbtt_info, ap_info.tbtt_info_len); return res; out_unlock: rcu_read_unlock(); return NULL; } static void cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *tx_data, struct cfg80211_bss *source_bss, const struct element *elem, gfp_t gfp) { struct cfg80211_inform_single_bss_data data = { .drv_data = tx_data->drv_data, .ftype = tx_data->ftype, .source_bss = source_bss, .bss_source = BSS_SOURCE_STA_PROFILE, }; struct element *reporter_rnr = NULL; struct ieee80211_multi_link_elem *ml_elem; struct cfg80211_mle *mle; const struct element *ssid_elem; const u8 *ssid = NULL; size_t ssid_len = 0; u16 control; u8 ml_common_len; u8 *new_ie = NULL; struct cfg80211_bss *bss; u8 mld_id, reporter_link_id, bss_change_count; u16 seen_links = 0; u8 i; if (!ieee80211_mle_type_ok(elem->data + 1, IEEE80211_ML_CONTROL_TYPE_BASIC, elem->datalen - 1)) return; ml_elem = (void *)(elem->data + 1); control = le16_to_cpu(ml_elem->control); ml_common_len = ml_elem->variable[0]; /* Must be present when transmitted by an AP (in a probe response) */ if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) || !(control & IEEE80211_MLC_BASIC_PRES_LINK_ID) || !(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) return; reporter_link_id = ieee80211_mle_get_link_id(elem->data + 1); bss_change_count = ieee80211_mle_get_bss_param_ch_cnt(elem->data + 1); /* * The MLD ID of the reporting AP is always zero. It is set if the AP * is part of an MBSSID set and will be non-zero for ML Elements * relating to a nontransmitted BSS (matching the Multi-BSSID Index, * Draft P802.11be_D3.2, 35.3.4.2) */ mld_id = ieee80211_mle_get_mld_id(elem->data + 1); /* Fully defrag the ML element for sta information/profile iteration */ mle = cfg80211_defrag_mle(elem, tx_data->ie, tx_data->ielen, gfp); if (!mle) return; /* No point in doing anything if there is no per-STA profile */ if (!mle->sta_prof[0]) goto out; new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp); if (!new_ie) goto out; reporter_rnr = cfg80211_gen_reporter_rnr(source_bss, u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_MLD_ID), mld_id == 0, reporter_link_id, bss_change_count, gfp); ssid_elem = cfg80211_find_elem(WLAN_EID_SSID, tx_data->ie, tx_data->ielen); if (ssid_elem) { ssid = ssid_elem->data; ssid_len = ssid_elem->datalen; } for (i = 0; i < ARRAY_SIZE(mle->sta_prof) && mle->sta_prof[i]; i++) { const struct ieee80211_neighbor_ap_info *ap_info; enum nl80211_band band; u32 freq; const u8 *profile; ssize_t profile_len; u8 param_ch_count; u8 link_id, use_for; bool non_tx; if (!ieee80211_mle_basic_sta_prof_size_ok((u8 *)mle->sta_prof[i], mle->sta_prof_len[i])) continue; control = le16_to_cpu(mle->sta_prof[i]->control); if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE)) continue; link_id = u16_get_bits(control, IEEE80211_MLE_STA_CONTROL_LINK_ID); if (seen_links & BIT(link_id)) break; seen_links |= BIT(link_id); if (!(control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) || !(control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) || !(control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)) continue; memcpy(data.bssid, mle->sta_prof[i]->variable, ETH_ALEN); data.beacon_interval = get_unaligned_le16(mle->sta_prof[i]->variable + 6); data.tsf = tx_data->tsf + get_unaligned_le64(mle->sta_prof[i]->variable + 8); /* sta_info_len counts itself */ profile = mle->sta_prof[i]->variable + mle->sta_prof[i]->sta_info_len - 1; profile_len = (u8 *)mle->sta_prof[i] + mle->sta_prof_len[i] - profile; if (profile_len < 2) continue; data.capability = get_unaligned_le16(profile); profile += 2; profile_len -= 2; /* Find in RNR to look up channel information */ use_for = cfg80211_rnr_info_for_mld_ap(tx_data->ie, tx_data->ielen, mld_id, link_id, &ap_info, &param_ch_count, &non_tx); if (!use_for) continue; /* * As of 802.11be_D5.0, the specification does not give us any * way of discovering both the MaxBSSID and the Multiple-BSSID * Index. It does seem like the Multiple-BSSID Index element * may be provided, but section 9.4.2.45 explicitly forbids * including a Multiple-BSSID Element (in this case without any * subelements). * Without both pieces of information we cannot calculate the * reference BSSID, so simply ignore the BSS. */ if (non_tx) continue; /* We could sanity check the BSSID is included */ if (!ieee80211_operating_class_to_band(ap_info->op_class, &band)) continue; freq = ieee80211_channel_to_freq_khz(ap_info->channel, band); data.channel = ieee80211_get_channel_khz(wiphy, freq); /* Skip if RNR element specifies an unsupported channel */ if (!data.channel) continue; /* Skip if BSS entry generated from MBSSID or DIRECT source * frame data available already. */ bss = cfg80211_get_bss(wiphy, data.channel, data.bssid, ssid, ssid_len, IEEE80211_BSS_TYPE_ANY, IEEE80211_PRIVACY_ANY); if (bss) { struct cfg80211_internal_bss *ibss = bss_from_pub(bss); if (data.capability == bss->capability && ibss->bss_source != BSS_SOURCE_STA_PROFILE) { cfg80211_put_bss(wiphy, bss); continue; } cfg80211_put_bss(wiphy, bss); } if (use_for == NL80211_BSS_USE_FOR_MLD_LINK && !(wiphy->flags & WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY)) { use_for = 0; data.cannot_use_reasons = NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY; } data.use_for = use_for; /* Generate new elements */ memset(new_ie, 0, IEEE80211_MAX_DATA_LEN); data.ie = new_ie; data.ielen = cfg80211_gen_new_ie(tx_data->ie, tx_data->ielen, profile, profile_len, new_ie, IEEE80211_MAX_DATA_LEN); if (!data.ielen) continue; /* The generated elements do not contain: * - Basic ML element * - A TBTT entry in the RNR for the transmitting AP * * This information is needed both internally and in userspace * as such, we should append it here. */ if (data.ielen + 3 + sizeof(*ml_elem) + ml_common_len > IEEE80211_MAX_DATA_LEN) continue; /* Copy the Basic Multi-Link element including the common * information, and then fix up the link ID and BSS param * change count. * Note that the ML element length has been verified and we * also checked that it contains the link ID. */ new_ie[data.ielen++] = WLAN_EID_EXTENSION; new_ie[data.ielen++] = 1 + sizeof(*ml_elem) + ml_common_len; new_ie[data.ielen++] = WLAN_EID_EXT_EHT_MULTI_LINK; memcpy(new_ie + data.ielen, ml_elem, sizeof(*ml_elem) + ml_common_len); new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN] = link_id; new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN + 1] = param_ch_count; data.ielen += sizeof(*ml_elem) + ml_common_len; if (reporter_rnr && (use_for & NL80211_BSS_USE_FOR_NORMAL)) { if (data.ielen + sizeof(struct element) + reporter_rnr->datalen > IEEE80211_MAX_DATA_LEN) continue; memcpy(new_ie + data.ielen, reporter_rnr, sizeof(struct element) + reporter_rnr->datalen); data.ielen += sizeof(struct element) + reporter_rnr->datalen; } bss = cfg80211_inform_single_bss_data(wiphy, &data, gfp); if (!bss) break; cfg80211_put_bss(wiphy, bss); } out: kfree(reporter_rnr); kfree(new_ie); kfree(mle); } static void cfg80211_parse_ml_sta_data(struct wiphy *wiphy, struct cfg80211_inform_single_bss_data *tx_data, struct cfg80211_bss *source_bss, gfp_t gfp) { const struct element *elem; if (!source_bss) return; if (tx_data->ftype != CFG80211_BSS_FTYPE_PRESP) return; for_each_element_extid(elem, WLAN_EID_EXT_EHT_MULTI_LINK, tx_data->ie, tx_data->ielen) cfg80211_parse_ml_elem_sta_data(wiphy, tx_data, source_bss, elem, gfp); } struct cfg80211_bss * cfg80211_inform_bss_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, gfp_t gfp) { struct cfg80211_inform_single_bss_data inform_data = { .drv_data = data, .ftype = ftype, .tsf = tsf, .capability = capability, .beacon_interval = beacon_interval, .ie = ie, .ielen = ielen, .use_for = data->restrict_use ? data->use_for : NL80211_BSS_USE_FOR_ALL, .cannot_use_reasons = data->cannot_use_reasons, }; struct cfg80211_bss *res; memcpy(inform_data.bssid, bssid, ETH_ALEN); res = cfg80211_inform_single_bss_data(wiphy, &inform_data, gfp); if (!res) return NULL; /* don't do any further MBSSID/ML handling for S1G */ if (ftype == CFG80211_BSS_FTYPE_S1G_BEACON) return res; cfg80211_parse_mbssid_data(wiphy, &inform_data, res, gfp); cfg80211_parse_ml_sta_data(wiphy, &inform_data, res, gfp); return res; } EXPORT_SYMBOL(cfg80211_inform_bss_data); struct cfg80211_bss * cfg80211_inform_bss_frame_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, struct ieee80211_mgmt *mgmt, size_t len, gfp_t gfp) { size_t min_hdr_len; struct ieee80211_ext *ext = NULL; enum cfg80211_bss_frame_type ftype; u16 beacon_interval; const u8 *bssid; u16 capability; const u8 *ie; size_t ielen; u64 tsf; if (WARN_ON(!mgmt)) return NULL; if (WARN_ON(!wiphy)) return NULL; BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); else min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon.variable); } else { /* same for beacons */ min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); } if (WARN_ON(len < min_hdr_len)) return NULL; ielen = len - min_hdr_len; ie = mgmt->u.probe_resp.variable; if (ext) { const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) ie = ext->u.s1g_short_beacon.variable; else ie = ext->u.s1g_beacon.variable; elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen); if (!elem) return NULL; if (elem->datalen < sizeof(*compat)) return NULL; compat = (void *)elem->data; bssid = ext->u.s1g_beacon.sa; capability = le16_to_cpu(compat->compat_info); beacon_interval = le16_to_cpu(compat->beacon_int); } else { bssid = mgmt->bssid; beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); } tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); if (ieee80211_is_probe_resp(mgmt->frame_control)) ftype = CFG80211_BSS_FTYPE_PRESP; else if (ext) ftype = CFG80211_BSS_FTYPE_S1G_BEACON; else ftype = CFG80211_BSS_FTYPE_BEACON; return cfg80211_inform_bss_data(wiphy, data, ftype, bssid, tsf, capability, beacon_interval, ie, ielen, gfp); } EXPORT_SYMBOL(cfg80211_inform_bss_frame_data); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!pub) return; spin_lock_bh(&rdev->bss_lock); bss_ref_get(rdev, bss_from_pub(pub)); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_ref_bss); void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!pub) return; spin_lock_bh(&rdev->bss_lock); bss_ref_put(rdev, bss_from_pub(pub)); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_put_bss); void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *tmp1; struct cfg80211_bss *nontrans_bss, *tmp; if (WARN_ON(!pub)) return; bss = bss_from_pub(pub); spin_lock_bh(&rdev->bss_lock); if (list_empty(&bss->list)) goto out; list_for_each_entry_safe(nontrans_bss, tmp, &pub->nontrans_list, nontrans_list) { tmp1 = bss_from_pub(nontrans_bss); if (__cfg80211_unlink_bss(rdev, tmp1)) rdev->bss_generation++; } if (__cfg80211_unlink_bss(rdev, bss)) rdev->bss_generation++; out: spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_unlink_bss); void cfg80211_bss_iter(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, void (*iter)(struct wiphy *wiphy, struct cfg80211_bss *bss, void *data), void *iter_data) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel, false)) iter(wiphy, &bss->pub, iter_data); } spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_bss_iter); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, unsigned int link_id, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *cbss = wdev->links[link_id].client.current_bss; struct cfg80211_internal_bss *new = NULL; struct cfg80211_internal_bss *bss; struct cfg80211_bss *nontrans_bss; struct cfg80211_bss *tmp; spin_lock_bh(&rdev->bss_lock); /* * Some APs use CSA also for bandwidth changes, i.e., without actually * changing the control channel, so no need to update in such a case. */ if (cbss->pub.channel == chan) goto done; /* use transmitting bss */ if (cbss->pub.transmitted_bss) cbss = bss_from_pub(cbss->pub.transmitted_bss); cbss->pub.channel = chan; list_for_each_entry(bss, &rdev->bss_list, list) { if (!cfg80211_bss_type_match(bss->pub.capability, bss->pub.channel->band, wdev->conn_bss_type)) continue; if (bss == cbss) continue; if (!cmp_bss(&bss->pub, &cbss->pub, BSS_CMP_REGULAR)) { new = bss; break; } } if (new) { /* to save time, update IEs for transmitting bss only */ cfg80211_update_known_bss(rdev, cbss, new, false); new->pub.proberesp_ies = NULL; new->pub.beacon_ies = NULL; list_for_each_entry_safe(nontrans_bss, tmp, &new->pub.nontrans_list, nontrans_list) { bss = bss_from_pub(nontrans_bss); if (__cfg80211_unlink_bss(rdev, bss)) rdev->bss_generation++; } WARN_ON(atomic_read(&new->hold)); if (!WARN_ON(!__cfg80211_unlink_bss(rdev, new))) rdev->bss_generation++; } cfg80211_rehash_bss(rdev, cbss); list_for_each_entry_safe(nontrans_bss, tmp, &cbss->pub.nontrans_list, nontrans_list) { bss = bss_from_pub(nontrans_bss); bss->pub.channel = chan; cfg80211_rehash_bss(rdev, bss); } done: spin_unlock_bh(&rdev->bss_lock); } #ifdef CONFIG_CFG80211_WEXT static struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) { struct cfg80211_registered_device *rdev; struct net_device *dev; ASSERT_RTNL(); dev = dev_get_by_index(net, ifindex); if (!dev) return ERR_PTR(-ENODEV); if (dev->ieee80211_ptr) rdev = wiphy_to_rdev(dev->ieee80211_ptr->wiphy); else rdev = ERR_PTR(-ENODEV); dev_put(dev); return rdev; } int cfg80211_wext_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct cfg80211_registered_device *rdev; struct wiphy *wiphy; struct iw_scan_req *wreq = NULL; struct cfg80211_scan_request *creq; int i, err, n_channels = 0; enum nl80211_band band; if (!netif_running(dev)) return -ENETDOWN; if (wrqu->data.length == sizeof(struct iw_scan_req)) wreq = (struct iw_scan_req *)extra; rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); if (IS_ERR(rdev)) return PTR_ERR(rdev); if (rdev->scan_req || rdev->scan_msg) return -EBUSY; wiphy = &rdev->wiphy; /* Determine number of channels, needed to allocate creq */ if (wreq && wreq->num_channels) { /* Passed from userspace so should be checked */ if (unlikely(wreq->num_channels > IW_MAX_FREQUENCIES)) return -EINVAL; n_channels = wreq->num_channels; } else { n_channels = ieee80211_get_num_supported_channels(wiphy); } creq = kzalloc(struct_size(creq, channels, n_channels) + sizeof(struct cfg80211_ssid), GFP_ATOMIC); if (!creq) return -ENOMEM; creq->wiphy = wiphy; creq->wdev = dev->ieee80211_ptr; /* SSIDs come after channels */ creq->ssids = (void *)creq + struct_size(creq, channels, n_channels); creq->n_channels = n_channels; creq->n_ssids = 1; creq->scan_start = jiffies; /* translate "Scan on frequencies" request */ i = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { struct ieee80211_channel *chan; /* ignore disabled channels */ chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || !cfg80211_wdev_channel_allowed(creq->wdev, chan)) continue; /* If we have a wireless request structure and the * wireless request specifies frequencies, then search * for the matching hardware channel. */ if (wreq && wreq->num_channels) { int k; int wiphy_freq = wiphy->bands[band]->channels[j].center_freq; for (k = 0; k < wreq->num_channels; k++) { struct iw_freq *freq = &wreq->channel_list[k]; int wext_freq = cfg80211_wext_freq(freq); if (wext_freq == wiphy_freq) goto wext_freq_found; } goto wext_freq_not_found; } wext_freq_found: creq->channels[i] = &wiphy->bands[band]->channels[j]; i++; wext_freq_not_found: ; } } /* No channels found? */ if (!i) { err = -EINVAL; goto out; } /* Set real number of channels specified in creq->channels[] */ creq->n_channels = i; /* translate "Scan for SSID" request */ if (wreq) { if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) return -EINVAL; memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) { creq->ssids = NULL; creq->n_ssids = 0; } } for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; eth_broadcast_addr(creq->bssid); scoped_guard(wiphy, &rdev->wiphy) { rdev->scan_req = creq; err = rdev_scan(rdev, creq); if (err) { rdev->scan_req = NULL; /* creq will be freed below */ } else { nl80211_send_scan_start(rdev, dev->ieee80211_ptr); /* creq now owned by driver */ creq = NULL; dev_hold(dev); } } out: kfree(creq); return err; } static char *ieee80211_scan_add_ies(struct iw_request_info *info, const struct cfg80211_bss_ies *ies, char *current_ev, char *end_buf) { const u8 *pos, *end, *next; struct iw_event iwe; if (!ies) return current_ev; /* * If needed, fragment the IEs buffer (at IE boundaries) into short * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. */ pos = ies->data; end = pos + ies->len; while (end - pos > IW_GENERIC_IE_MAX) { next = pos + 2 + pos[1]; while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) next = next + 2 + next[1]; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = next - pos; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (void *)pos); if (IS_ERR(current_ev)) return current_ev; pos = next; } if (end > pos) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = end - pos; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (void *)pos); if (IS_ERR(current_ev)) return current_ev; } return current_ev; } static char * ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, struct cfg80211_internal_bss *bss, char *current_ev, char *end_buf) { const struct cfg80211_bss_ies *ies; struct iw_event iwe; const u8 *ie; u8 buf[50]; u8 *cfg, *p, *tmp; int rem, i, sig; bool ismesh = false; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWAP; iwe.u.ap_addr.sa_family = ARPHRD_ETHER; memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN); current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_ADDR_LEN); if (IS_ERR(current_ev)) return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq); iwe.u.freq.e = 0; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); if (IS_ERR(current_ev)) return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = bss->pub.channel->center_freq; iwe.u.freq.e = 6; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); if (IS_ERR(current_ev)) return current_ev; if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVQUAL; iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | IW_QUAL_NOISE_INVALID | IW_QUAL_QUAL_UPDATED; switch (wiphy->signal_type) { case CFG80211_SIGNAL_TYPE_MBM: sig = bss->pub.signal / 100; iwe.u.qual.level = sig; iwe.u.qual.updated |= IW_QUAL_DBM; if (sig < -110) /* rather bad */ sig = -110; else if (sig > -40) /* perfect */ sig = -40; /* will give a range of 0 .. 70 */ iwe.u.qual.qual = sig + 110; break; case CFG80211_SIGNAL_TYPE_UNSPEC: iwe.u.qual.level = bss->pub.signal; /* will give range 0 .. 100 */ iwe.u.qual.qual = bss->pub.signal; break; default: /* not reached */ break; } current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_QUAL_LEN); if (IS_ERR(current_ev)) return current_ev; } memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWENCODE; if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY) iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; else iwe.u.data.flags = IW_ENCODE_DISABLED; iwe.u.data.length = 0; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, ""); if (IS_ERR(current_ev)) return current_ev; rcu_read_lock(); ies = rcu_dereference(bss->pub.ies); rem = ies->len; ie = ies->data; while (rem >= 2) { /* invalid data */ if (ie[1] > rem - 2) break; switch (ie[0]) { case WLAN_EID_SSID: memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (u8 *)ie + 2); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_MESH_ID: memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (u8 *)ie + 2); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_MESH_CONFIG: ismesh = true; if (ie[1] != sizeof(struct ieee80211_meshconf_ie)) break; cfg = (u8 *)ie + 2; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; iwe.u.data.length = sprintf(buf, "Mesh Network Path Selection Protocol ID: 0x%02X", cfg[0]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Path Selection Metric ID: 0x%02X", cfg[1]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Congestion Control Mode ID: 0x%02X", cfg[2]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Synchronization ID: 0x%02X", cfg[3]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Authentication ID: 0x%02X", cfg[4]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Formation Info: 0x%02X", cfg[5]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; iwe.u.data.length = sprintf(buf, "Capabilities: 0x%02X", cfg[6]); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_SUPP_RATES: case WLAN_EID_EXT_SUPP_RATES: /* display all supported rates in readable format */ p = current_ev + iwe_stream_lcp_len(info); memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWRATE; /* Those two flags are ignored... */ iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; for (i = 0; i < ie[1]; i++) { iwe.u.bitrate.value = ((ie[i + 2] & 0x7f) * 500000); tmp = p; p = iwe_stream_add_value(info, current_ev, p, end_buf, &iwe, IW_EV_PARAM_LEN); if (p == tmp) { current_ev = ERR_PTR(-E2BIG); goto unlock; } } current_ev = p; break; } rem -= ie[1] + 2; ie += ie[1] + 2; } if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) || ismesh) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWMODE; if (ismesh) iwe.u.mode = IW_MODE_MESH; else if (bss->pub.capability & WLAN_CAPABILITY_ESS) iwe.u.mode = IW_MODE_MASTER; else iwe.u.mode = IW_MODE_ADHOC; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_UINT_LEN); if (IS_ERR(current_ev)) goto unlock; } memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; iwe.u.data.length = sprintf(buf, "tsf=%016llx", (unsigned long long)(ies->tsf)); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; iwe.u.data.length = sprintf(buf, " Last beacon: %ums ago", elapsed_jiffies_msecs(bss->ts)); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; current_ev = ieee80211_scan_add_ies(info, ies, current_ev, end_buf); unlock: rcu_read_unlock(); return current_ev; } static int ieee80211_scan_results(struct cfg80211_registered_device *rdev, struct iw_request_info *info, char *buf, size_t len) { char *current_ev = buf; char *end_buf = buf + len; struct cfg80211_internal_bss *bss; int err = 0; spin_lock_bh(&rdev->bss_lock); cfg80211_bss_expire(rdev); list_for_each_entry(bss, &rdev->bss_list, list) { if (buf + len - current_ev <= IW_EV_ADDR_LEN) { err = -E2BIG; break; } current_ev = ieee80211_bss(&rdev->wiphy, info, bss, current_ev, end_buf); if (IS_ERR(current_ev)) { err = PTR_ERR(current_ev); break; } } spin_unlock_bh(&rdev->bss_lock); if (err) return err; return current_ev - buf; } int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct cfg80211_registered_device *rdev; int res; if (!netif_running(dev)) return -ENETDOWN; rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); if (IS_ERR(rdev)) return PTR_ERR(rdev); if (rdev->scan_req || rdev->scan_msg) return -EAGAIN; res = ieee80211_scan_results(rdev, info, extra, data->length); data->length = 0; if (res >= 0) { data->length = res; res = 0; } return res; } #endif
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 // SPDX-License-Identifier: GPL-2.0-only /* * test/set flag bits stored in conntrack extension area. * * (C) 2013 Astaro GmbH & Co KG */ #include <linux/export.h> #include <linux/types.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; do { old = *address; tmp = (old & mask) ^ new; if (old == tmp) return 0; } while (cmpxchg(address, old, tmp) != old); return 1; } int nf_connlabels_replace(struct nf_conn *ct, const u32 *data, const u32 *mask, unsigned int words32) { struct nf_conn_labels *labels; unsigned int size, i; int changed = 0; u32 *dst; labels = nf_ct_labels_find(ct); if (!labels) return -ENOSPC; size = sizeof(labels->bits); if (size < (words32 * sizeof(u32))) words32 = size / sizeof(u32); dst = (u32 *) labels->bits; for (i = 0; i < words32; i++) changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); size /= sizeof(u32); for (i = words32; i < size; i++) /* pad */ replace_u32(&dst[i], 0, 0); if (changed) nf_conntrack_event_cache(IPCT_LABEL, ct); return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { int v; if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); v = atomic_inc_return_relaxed(&net->ct.labels_used); WARN_ON_ONCE(v <= 0); return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_get); void nf_connlabels_put(struct net *net) { int v = atomic_dec_return_relaxed(&net->ct.labels_used); WARN_ON_ONCE(v < 0); } EXPORT_SYMBOL_GPL(nf_connlabels_put);
2 2 2 2 6 6 6 7 6 7 17 18 18 18 18 2 2 2 6 6 6 6 6 6 6 17 1 1 1 1 22 6 6 6 6 2 6 3 6 17 3 3 3 3 3 17 1 22 15 17 3 15 15 15 2 2 1 2 17 2 17 16 16 16 16 17 19 20 20 20 2 2 5 5 1 5 5 5 25 19 11 17 17 20 2 2 2 2 2 2 2 2 2 2 2 25 25 20 13 13 13 13 13 13 13 17 4 13 9 9 9 9 15 15 15 15 15 15 15 24 15 9 24 15 15 15 24 9 9 9 9 24 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/cpumask.h> #include <linux/spinlock.h> #include <linux/percpu.h> #include "bpf_lru_list.h" #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET /* Helpers to get the local list index */ #define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) #define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) #define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) static int get_next_cpu(int cpu) { cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_possible_mask); return cpu; } /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_FREE_LIST_IDX]; } static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; } /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { return READ_ONCE(node->ref); } static void bpf_lru_node_clear_ref(struct bpf_lru_node *node) { WRITE_ONCE(node->ref, 0); } static void bpf_lru_list_count_inc(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]++; } static void bpf_lru_list_count_dec(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]--; } static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, struct bpf_lru_node *node, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; /* If the removing node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; list_move(&node->list, free_list); } /* Move nodes from local list to the LRU list */ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); list_move(&node->list, &l->lists[tgt_type]); } /* Move nodes between or within active and inactive list (like * active to inactive, inactive to active or tail of active back to * the head of active). */ static void __bpf_lru_node_move(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; if (node->type != tgt_type) { bpf_lru_list_count_dec(l, node->type); bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; } bpf_lru_node_clear_ref(node); /* If the moving node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; list_move(&node->list, &l->lists[tgt_type]); } static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) { return l->counts[BPF_LRU_LIST_T_INACTIVE] < l->counts[BPF_LRU_LIST_T_ACTIVE]; } /* Rotate the active list: * 1. Start from tail * 2. If the node has the ref bit set, it will be rotated * back to the head of active list with the ref bit cleared. * Give this node one more chance to survive in the active list. * 3. If the ref bit is not set, move it to the head of the * inactive list. * 4. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; struct bpf_lru_node *node, *tmp_node, *first_node; unsigned int i = 0; first_node = list_first_entry(active, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, active, list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); if (++i == lru->nr_scans || node == first_node) break; } } /* Rotate the inactive list. It starts from the next_inactive_rotation * 1. If the node has ref bit set, it will be moved to the head * of active list with the ref bit cleared. * 2. If the node does not have ref bit set, it will leave it * at its current location (i.e. do nothing) so that it can * be considered during the next inactive_shrink. * 3. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct list_head *cur, *last, *next = inactive; struct bpf_lru_node *node; unsigned int i = 0; if (list_empty(inactive)) return; last = l->next_inactive_rotation->next; if (last == inactive) last = last->next; cur = l->next_inactive_rotation; while (i < lru->nr_scans) { if (cur == inactive) { cur = cur->prev; continue; } node = list_entry(cur, struct bpf_lru_node, list); next = cur->prev; if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); if (cur == last) break; cur = next; i++; } l->next_inactive_rotation = next; } /* Shrink the inactive list. It starts from the tail of the * inactive list and only move the nodes without the ref bit * set to the designated free list. */ static unsigned int __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); } else if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) break; } if (++i == lru->nr_scans) break; } return nshrinked; } /* 1. Rotate the active list (if needed) * 2. Always rotate the inactive list */ static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) { if (bpf_lru_list_inactive_low(l)) __bpf_lru_list_rotate_active(lru, l); __bpf_lru_list_rotate_inactive(lru, l); } /* Calls __bpf_lru_list_shrink_inactive() to shrink some * ref-bit-cleared nodes and move them to the designated * free list. * * If it cannot get a free node after calling * __bpf_lru_list_shrink_inactive(). It will just remove * one node from either inactive or active list without * honoring the ref-bit. It prefers inactive list to active * list in this situation. */ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct bpf_lru_node *node, *tmp_node; struct list_head *force_shrink_list; unsigned int nshrinked; nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, free_list, tgt_free_type); if (nshrinked) return nshrinked; /* Do a force shrink by ignoring the reference bit */ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; else force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; } } return 0; } /* Flush the nodes from the local pending list to the LRU list */ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, local_pending_list(loc_l), list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_INACTIVE); } } static void bpf_lru_list_push_free(struct bpf_lru_list *l, struct bpf_lru_node *node) { unsigned long flags; if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; raw_spin_lock(&l->lock); __local_list_flush(l, loc_l); __bpf_lru_list_rotate(lru, l); list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == LOCAL_FREE_TARGET) break; } if (nfree < LOCAL_FREE_TARGET) __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); raw_spin_unlock(&l->lock); } static void __local_list_add_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l, int cpu, struct bpf_lru_node *node, u32 hash) { *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; bpf_lru_node_clear_ref(node); list_add(&node->list, local_pending_list(loc_l)); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; node = list_first_entry_or_null(local_free_list(loc_l), struct bpf_lru_node, list); if (node) list_del(&node->list); return node; } static struct bpf_lru_node * __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; ignore_ref: /* Get from the tail (i.e. older element) of the pending list. */ list_for_each_entry_reverse(node, local_pending_list(loc_l), list) { if ((!bpf_lru_node_is_ref(node) || force) && lru->del_from_htab(lru->del_arg, node)) { list_del(&node->list); return node; } } if (!force) { force = true; goto ignore_ref; } return NULL; } static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct list_head *free_list; struct bpf_lru_node *node = NULL; struct bpf_lru_list *l; unsigned long flags; int cpu = raw_smp_processor_id(); l = per_cpu_ptr(lru->percpu_lru, cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_list_rotate(lru, l); free_list = &l->lists[BPF_LRU_LIST_T_FREE]; if (list_empty(free_list)) __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, BPF_LRU_LIST_T_FREE); if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); *(u32 *)((void *)node + lru->hash_offset) = hash; bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } raw_spin_unlock_irqrestore(&l->lock, flags); return node; } static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; struct bpf_lru_node *node; int steal, first_steal; unsigned long flags; int cpu = raw_smp_processor_id(); loc_l = per_cpu_ptr(clru->local_list, cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); node = __local_list_pop_free(loc_l); if (!node) { bpf_lru_list_pop_free_to_local(lru, loc_l); node = __local_list_pop_free(loc_l); } if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; /* No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. */ first_steal = loc_l->next_steal; steal = first_steal; do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); raw_spin_lock_irqsave(&steal_loc_l->lock, flags); node = __local_list_pop_free(steal_loc_l); if (!node) node = __local_list_pop_pending(lru, steal_loc_l); raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); steal = get_next_cpu(steal); } while (!node && steal != first_steal); loc_l->next_steal = steal; if (node) { raw_spin_lock_irqsave(&loc_l->lock, flags); __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); } return node; } struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) { if (lru->percpu) return bpf_percpu_lru_pop_free(lru, hash); else return bpf_common_lru_pop_free(lru, hash); } static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { u8 node_type = READ_ONCE(node->type); unsigned long flags; if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { raw_spin_unlock_irqrestore(&loc_l->lock, flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_move(&node->list, local_free_list(loc_l)); raw_spin_unlock_irqrestore(&loc_l->lock, flags); return; } check_lru_list: bpf_lru_list_push_free(&lru->common_lru.lru_list, node); } static void bpf_percpu_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { struct bpf_lru_list *l; unsigned long flags; l = per_cpu_ptr(lru->percpu_lru, node->cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { if (lru->percpu) bpf_percpu_lru_push_free(lru, node); else bpf_common_lru_push_free(lru, node); } static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; for (i = 0; i < nr_elems; i++) { struct bpf_lru_node *node; node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { u32 i, pcpu_entries; int cpu; struct bpf_lru_list *l; pcpu_entries = nr_elems / num_possible_cpus(); i = 0; for_each_possible_cpu(cpu) { struct bpf_lru_node *node; l = per_cpu_ptr(lru->percpu_lru, cpu); again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; buf += elem_size; if (i == nr_elems) break; if (i % pcpu_entries) goto again; } } void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { if (lru->percpu) bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, nr_elems); else bpf_common_lru_populate(lru, buf, node_offset, elem_size, nr_elems); } static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { int i; for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) INIT_LIST_HEAD(&loc_l->lists[i]); loc_l->next_steal = cpu; raw_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) { int i; for (i = 0; i < NR_BPF_LRU_LIST_T; i++) INIT_LIST_HEAD(&l->lists[i]); for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) l->counts[i] = 0; l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; raw_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *del_arg) { int cpu; if (percpu) { lru->percpu_lru = alloc_percpu(struct bpf_lru_list); if (!lru->percpu_lru) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_list *l; l = per_cpu_ptr(lru->percpu_lru, cpu); bpf_lru_list_init(l); } lru->nr_scans = PERCPU_NR_SCANS; } else { struct bpf_common_lru *clru = &lru->common_lru; clru->local_list = alloc_percpu(struct bpf_lru_locallist); if (!clru->local_list) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(clru->local_list, cpu); bpf_lru_locallist_init(loc_l, cpu); } bpf_lru_list_init(&clru->lru_list); lru->nr_scans = LOCAL_NR_SCANS; } lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; lru->hash_offset = hash_offset; return 0; } void bpf_lru_destroy(struct bpf_lru *lru) { if (lru->percpu) free_percpu(lru->percpu_lru); else free_percpu(lru->common_lru.local_list); }
481 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SMP_H #define __LINUX_SMP_H /* * Generic SMP support * Alan Cox. <alan@redhat.com> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/list.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/smp_types.h> typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { struct __call_single_node node; smp_call_func_t func; void *info; }; #define CSD_INIT(_func, _info) \ (struct __call_single_data){ .func = (_func), .info = (_info), } /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); #define INIT_CSD(_csd, _func, _info) \ do { \ *(_csd) = CSD_INIT((_func), (_info)); \ } while (0) /* * Enqueue a llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. */ extern void __smp_call_single_queue(int cpu, struct llist_node *node); /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); int smp_call_function_single_async(int cpu, call_single_data_t *csd); /* * Cpus stopping functions in panic. All have default weak definitions. * Architecture-dependent code may override them. */ void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); /* * Call a function on all processors */ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) { on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask); } /** * on_each_cpu_mask(): Run a function on processors specified by * cpumask, which may include the local processor. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. The * exception is that it may be used during early boot while * early_boot_irqs_disabled is set. */ static inline void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(NULL, func, info, wait, mask); } /* * Call a function on each processor for which the supplied function * cond_func returns a positive value. This may include the local * processor. May be used during early boot while early_boot_irqs_disabled is * set. Use local_irq_save/restore() instead of local_irq_disable/enable(). */ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } /* * Architecture specific boot CPU setup. Defined as empty weak function in * init/main.c. Architectures can override it. */ void __init smp_prepare_boot_cpu(void); #ifdef CONFIG_SMP #include <linux/preempt.h> #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/smp.h> /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): */ /* * stops all CPUs but the current one: */ extern void smp_send_stop(void); /* * sends a 'reschedule' event to another CPU: */ extern void arch_smp_send_reschedule(int cpu); /* * scheduler_ipi() is inline so can't be passed as callback reason, but the * callsite IP should be sufficient for root-causing IPIs sent from here. */ #define smp_send_reschedule(cpu) ({ \ trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ arch_smp_send_reschedule(cpu); \ }) /* * Prepare machine for booting other CPUs. */ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Bring a CPU up */ extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle); /* * Final polishing of CPUs */ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ void smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); /* * Generic and arch helpers */ void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); extern int __boot_cpu_id; static inline int get_boot_cpu_id(void) { return __boot_cpu_id; } #else /* !SMP */ static inline void smp_send_stop(void) { } /* * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 static inline void up_smp_call_function(smp_call_func_t func, void *info) { } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) static inline void smp_send_reschedule(int cpu) { } #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) static inline void call_function_init(void) { } static inline int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { return smp_call_function_single(0, func, info, wait); } static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } #define setup_max_cpus 0 #ifdef CONFIG_UP_LATE_INIT extern void __init up_late_init(void); static inline void smp_init(void) { up_late_init(); } #else static inline void smp_init(void) { } #endif static inline int get_boot_cpu_id(void) { return 0; } #endif /* !SMP */ /** * raw_processor_id() - get the current (unstable) CPU id * * For then you know what you are doing and need an unstable * CPU id. */ /** * smp_processor_id() - get the current (stable) CPU id * * This is the normal accessor to the CPU id and should be used * whenever possible. * * The CPU id is stable when: * * - IRQs are disabled; * - preemption is disabled; * - the task is CPU affine. * * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN * when smp_processor_id() is used when the CPU id is not stable. */ /* * Allow the architecture to differentiate between a stable and unstable read. * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a * regular asm read for the stable. */ #ifndef __smp_processor_id #define __smp_processor_id() raw_smp_processor_id() #endif #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else # define smp_processor_id() __smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: */ extern void arch_disable_smp_support(void); extern void arch_thaw_secondary_cpus_begin(void); extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys); /* SMP core functions */ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG bool csd_lock_is_stuck(void); #else static inline bool csd_lock_is_stuck(void) { return false; } #endif #endif /* __LINUX_SMP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: MIT */ /* * The VGA aribiter manages VGA space routing and VGA resource decode to * allow multiple VGA devices to be used in a system in a safe way. * * (C) Copyright 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org> * (C) Copyright 2007 Paulo R. Zanoni <przanoni@gmail.com> * (C) Copyright 2007, 2009 Tiago Vignatti <vignatti@freedesktop.org> */ #ifndef LINUX_VGA_H #define LINUX_VGA_H #include <video/vga.h> struct pci_dev; /* Legacy VGA regions */ #define VGA_RSRC_NONE 0x00 #define VGA_RSRC_LEGACY_IO 0x01 #define VGA_RSRC_LEGACY_MEM 0x02 #define VGA_RSRC_LEGACY_MASK (VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM) /* Non-legacy access */ #define VGA_RSRC_NORMAL_IO 0x04 #define VGA_RSRC_NORMAL_MEM 0x08 #ifdef CONFIG_VGA_ARB void vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes); int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible); void vga_put(struct pci_dev *pdev, unsigned int rsrc); struct pci_dev *vga_default_device(void); void vga_set_default_device(struct pci_dev *pdev); int vga_remove_vgacon(struct pci_dev *pdev); int vga_client_register(struct pci_dev *pdev, unsigned int (*set_decode)(struct pci_dev *pdev, bool state)); #else /* CONFIG_VGA_ARB */ static inline void vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes) { }; static inline int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) { return 0; } static inline void vga_put(struct pci_dev *pdev, unsigned int rsrc) { } static inline struct pci_dev *vga_default_device(void) { return NULL; } static inline void vga_set_default_device(struct pci_dev *pdev) { } static inline int vga_remove_vgacon(struct pci_dev *pdev) { return 0; } static inline int vga_client_register(struct pci_dev *pdev, unsigned int (*set_decode)(struct pci_dev *pdev, bool state)) { return 0; } #endif /* CONFIG_VGA_ARB */ /** * vga_get_interruptible * @pdev: pci device of the VGA card or NULL for the system default * @rsrc: bit mask of resources to acquire and lock * * Shortcut to vga_get with interruptible set to true. * * On success, release the VGA resource again with vga_put(). */ static inline int vga_get_interruptible(struct pci_dev *pdev, unsigned int rsrc) { return vga_get(pdev, rsrc, 1); } /** * vga_get_uninterruptible - shortcut to vga_get() * @pdev: pci device of the VGA card or NULL for the system default * @rsrc: bit mask of resources to acquire and lock * * Shortcut to vga_get with interruptible set to false. * * On success, release the VGA resource again with vga_put(). */ static inline int vga_get_uninterruptible(struct pci_dev *pdev, unsigned int rsrc) { return vga_get(pdev, rsrc, 0); } static inline void vga_client_unregister(struct pci_dev *pdev) { vga_client_register(pdev, NULL); } #endif /* LINUX_VGA_H */
2 29 29 10 32 17 367 32 21 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _linux_POSIX_TIMERS_H #define _linux_POSIX_TIMERS_H #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/pid.h> #include <linux/posix-timers_types.h> #include <linux/rcuref.h> #include <linux/spinlock.h> #include <linux/timerqueue.h> struct kernel_siginfo; struct task_struct; struct sigqueue; struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) { return ((~pid) << 3) | clock; } static inline clockid_t make_thread_cpuclock(const unsigned int tid, const clockid_t clock) { return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK); } static inline clockid_t fd_to_clockid(const int fd) { return make_process_cpuclock((unsigned int) fd, CLOCKFD); } static inline int clockid_to_fd(const clockid_t clk) { return ~(clk >> 3); } #ifdef CONFIG_POSIX_TIMERS #include <linux/signal_types.h> /** * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig * @head: timerqueue head on which this timer is queued * @pid: Pointer to target task PID * @elist: List head for the expiry list * @firing: Timer is currently firing * @nanosleep: Timer is used for nanosleep and is not a regular posix-timer * @handling: Pointer to the task which handles expiry */ struct cpu_timer { struct timerqueue_node node; struct timerqueue_head *head; struct pid *pid; struct list_head elist; bool firing; bool nanosleep; struct task_struct __rcu *handling; }; static inline bool cpu_timer_enqueue(struct timerqueue_head *head, struct cpu_timer *ctmr) { ctmr->head = head; return timerqueue_add(head, &ctmr->node); } static inline bool cpu_timer_queued(struct cpu_timer *ctmr) { return !!ctmr->head; } static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { if (cpu_timer_queued(ctmr)) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; return true; } return false; } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) { return ctmr->node.expires; } static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) { ctmr->node.expires = exp; } static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); pct->bases[0].nextevt = U64_MAX; pct->bases[1].nextevt = U64_MAX; pct->bases[2].nextevt = U64_MAX; } void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, u64 runtime) { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ .nextevt = U64_MAX, \ } #define INIT_CPU_TIMERBASES(b) { \ INIT_CPU_TIMERBASE(b[0]), \ INIT_CPU_TIMERBASE(b[1]), \ INIT_CPU_TIMERBASE(b[2]), \ } #define INIT_CPU_TIMERS(s) \ .posix_cputimers = { \ .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else struct cpu_timer { }; #define INIT_CPU_TIMERS(s) static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK void clear_posix_cputimers_work(struct task_struct *p); void posix_cputimers_init_work(void); #else static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List node for binding the timer to tsk::signal::posix_timers * @ignored_list: List node for tracking ignored timers in tsk::signal::ignored_posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_status: The status of the timer * @it_sig_periodic: The periodic status at signal delivery * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery * @it_sigqueue_seq: The sequence count at the point where the signal was queued * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @rcuref: Reference count for life time management * @sigq: Embedded sigqueue * @it: Union representing the various posix timer type * internals. * @rcu: RCU head for freeing the timer. */ struct k_itimer { struct hlist_node list; struct hlist_node ignored_list; struct hlist_node t_hash; spinlock_t it_lock; const struct k_clock *kclock; clockid_t it_clock; timer_t it_id; int it_status; bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; unsigned int it_sigqueue_seq; int it_sigev_notify; enum pid_type it_pid_type; ktime_t it_interval; struct signal_struct *it_signal; union { struct pid *it_pid; struct task_struct *it_process; }; struct sigqueue sigq; rcuref_t rcuref; union { struct { struct hrtimer timer; } real; struct cpu_timer cpu; struct { struct alarm alarmtimer; } alarm; } it; struct rcu_head rcu; }; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, u64 *newval, u64 *oldval); int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); #ifdef CONFIG_POSIX_TIMERS static inline void posixtimer_putref(struct k_itimer *tmr) { if (rcuref_put(&tmr->rcuref)) posixtimer_free_timer(tmr); } static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); WARN_ON_ONCE(!rcuref_get(&tmr->rcuref)); } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); posixtimer_putref(tmr); } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } #endif /* !CONFIG_POSIX_TIMERS */ #endif
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 13 10 10 3 13 2 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 9 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 /* SPDX-License-Identifier: GPL-2.0 * * page_pool.c * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ #include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/xdp.h> #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> #include <linux/mm.h> /* for put_page() */ #include <linux/poison.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <trace/events/page_pool.h> #include "mp_dmabuf_devmem.h" #include "netmem_priv.h" #include "page_pool_priv.h" DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) #define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ #define recycle_stat_inc(pool, __stat) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_inc(s->__stat); \ } while (0) #define recycle_stat_add(pool, __stat, val) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_add(s->__stat, val); \ } while (0) static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_alloc_fast", "rx_pp_alloc_slow", "rx_pp_alloc_slow_ho", "rx_pp_alloc_empty", "rx_pp_alloc_refill", "rx_pp_alloc_waive", "rx_pp_recycle_cached", "rx_pp_recycle_cache_full", "rx_pp_recycle_ring", "rx_pp_recycle_ring_full", "rx_pp_recycle_released_ref", }; /** * page_pool_get_stats() - fetch page pool stats * @pool: pool from which page was allocated * @stats: struct page_pool_stats to fill in * * Retrieve statistics about the page_pool. This API is only available * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. * A pointer to a caller allocated struct page_pool_stats structure * is passed to this API which is filled in. The caller can then report * those stats to the user (perhaps via ethtool, debugfs, etc.). */ bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; if (!stats) return false; /* The caller is responsible to initialize stats. */ stats->alloc_stats.fast += pool->alloc_stats.fast; stats->alloc_stats.slow += pool->alloc_stats.slow; stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; stats->alloc_stats.empty += pool->alloc_stats.empty; stats->alloc_stats.refill += pool->alloc_stats.refill; stats->alloc_stats.waive += pool->alloc_stats.waive; for_each_possible_cpu(cpu) { const struct page_pool_recycle_stats *pcpu = per_cpu_ptr(pool->recycle_stats, cpu); stats->recycle_stats.cached += pcpu->cached; stats->recycle_stats.cache_full += pcpu->cache_full; stats->recycle_stats.ring += pcpu->ring; stats->recycle_stats.ring_full += pcpu->ring_full; stats->recycle_stats.released_refcnt += pcpu->released_refcnt; } return true; } EXPORT_SYMBOL(page_pool_get_stats); u8 *page_pool_ethtool_stats_get_strings(u8 *data) { int i; for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { memcpy(data, pp_stats[i], ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); int page_pool_ethtool_stats_get_count(void) { return ARRAY_SIZE(pp_stats); } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; *data++ = pool_stats->alloc_stats.slow_high_order; *data++ = pool_stats->alloc_stats.empty; *data++ = pool_stats->alloc_stats.refill; *data++ = pool_stats->alloc_stats.waive; *data++ = pool_stats->recycle_stats.cached; *data++ = pool_stats->recycle_stats.cache_full; *data++ = pool_stats->recycle_stats.ring; *data++ = pool_stats->recycle_stats.ring_full; *data++ = pool_stats->recycle_stats.released_refcnt; return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else #define alloc_stat_inc(pool, __stat) #define recycle_stat_inc(pool, __stat) #define recycle_stat_add(pool, __stat, val) #endif static bool page_pool_producer_lock(struct page_pool *pool) __acquires(&pool->ring.producer_lock) { bool in_softirq = in_softirq(); if (in_softirq) spin_lock(&pool->ring.producer_lock); else spin_lock_bh(&pool->ring.producer_lock); return in_softirq; } static void page_pool_producer_unlock(struct page_pool *pool, bool in_softirq) __releases(&pool->ring.producer_lock) { if (in_softirq) spin_unlock(&pool->ring.producer_lock); else spin_unlock_bh(&pool->ring.producer_lock); } static void page_pool_struct_check(void) { CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, PAGE_POOL_FRAG_GROUP_ALIGN); } static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params, int cpuid) { unsigned int ring_qsize = 1024; /* Default */ struct netdev_rx_queue *rxq; int err; page_pool_struct_check(); memcpy(&pool->p, &params->fast, sizeof(pool->p)); memcpy(&pool->slow, &params->slow, sizeof(pool->slow)); pool->cpuid = cpuid; pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) ring_qsize = pool->p.pool_size; /* Sanity limit mem that can be pinned down */ if (ring_qsize > 32768) return -E2BIG; /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; pool->dma_map = true; } if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; pool->dma_sync = true; /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ } pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) return -ENOMEM; } else { /* For system page pool instance we use a singular stats object * instead of allocating a separate percpu variable for each * (also percpu) page pool instance. */ pool->recycle_stats = &pp_system_recycle_stats; pool->system = true; } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return -ENOMEM; } atomic_set(&pool->pages_state_release_cnt, 0); /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); if (pool->dma_map) get_device(pool->p.dev); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { /* We rely on rtnl_lock()ing to make sure netdev_rx_queue * configuration doesn't change while we're initializing * the page_pool. */ ASSERT_RTNL(); rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; } if (pool->mp_priv) { if (!pool->dma_map || !pool->dma_sync) return -EOPNOTSUPP; err = mp_dmabuf_devmem_init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); goto free_ptr_ring; } static_branch_inc(&page_pool_mem_providers); } return 0; free_ptr_ring: ptr_ring_cleanup(&pool->ring, NULL); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return err; } static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); if (pool->dma_map) put_device(pool->p.dev); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif } /** * page_pool_create_percpu() - create a page pool for a given cpu. * @params: parameters, see struct page_pool_params * @cpuid: cpu identifier */ struct page_pool * page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); if (!pool) return ERR_PTR(-ENOMEM); err = page_pool_init(pool, params, cpuid); if (err < 0) goto err_free; err = page_pool_list(pool); if (err) goto err_uninit; return pool; err_uninit: page_pool_uninit(pool); err_free: pr_warn("%s() gave up with errno %d\n", __func__, err); kfree(pool); return ERR_PTR(err); } EXPORT_SYMBOL(page_pool_create_percpu); /** * page_pool_create() - create a page pool * @params: parameters, see struct page_pool_params */ struct page_pool *page_pool_create(const struct page_pool_params *params) { return page_pool_create_percpu(params, -1); } EXPORT_SYMBOL(page_pool_create); static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; netmem_ref netmem; int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) { alloc_stat_inc(pool, empty); return 0; } /* Softirq guarantee CPU and thus NUMA node is stable. This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. */ #ifdef CONFIG_NUMA pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; #else /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ #endif /* Refill alloc array, but only if NUMA match */ do { netmem = (__force netmem_ref)__ptr_ring_consume(r); if (unlikely(!netmem)) break; if (likely(netmem_is_pref_nid(netmem, pref_nid))) { pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; * (1) release 1 page to page-allocator and * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ page_pool_return_page(pool, netmem); alloc_stat_inc(pool, waive); netmem = 0; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, refill); } return netmem; } /* fast path */ static netmem_ref __page_pool_get_cached(struct page_pool *pool) { netmem_ref netmem; /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ if (likely(pool->alloc.count)) { /* Fast-path */ netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, fast); } else { netmem = page_pool_refill_alloc_cache(pool); } return netmem; } static void __page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); dma_sync_size = min(dma_sync_size, pool->p.max_len); __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, dma_sync_size, pool->p.dma_dir); #endif } static __always_inline void page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); } static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) { dma_addr_t dma; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. */ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, (PAGE_SIZE << pool->p.order), pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); if (dma_mapping_error(pool->p.dev, dma)) return false; if (page_pool_set_dma_addr_netmem(netmem, dma)) goto unmap_failed; page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; unmap_failed: WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); return false; } static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { struct page *page; gfp |= __GFP_COMP; page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); if (unlikely(!page)) return NULL; if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { put_page(page); return NULL; } alloc_stat_inc(pool, slow_high_order); page_pool_set_pp_info(pool, page_to_netmem(page)); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page_to_netmem(page), pool->pages_state_hold_cnt); return page; } /* slow path */ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; unsigned int pp_order = pool->p.order; bool dma_map = pool->dma_map; netmem_ref netmem; int i, nr_pages; /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); /* Unnecessary as alloc cache is empty, but guarantees zero count */ if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; /* Pages have been filled into alloc.cache array, but count is zero and * page element have not been (possibly) DMA mapped. */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { put_page(netmem_to_page(netmem)); continue; } page_pool_set_pp_info(pool, netmem); pool->alloc.cache[pool->alloc.count++] = netmem; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, slow); } else { netmem = 0; } /* When page just alloc'ed is should/must have refcnt 1. */ return netmem; } /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; /* Fast-path: Get a page from cache */ netmem = __page_pool_get_cached(pool); if (netmem) return netmem; /* Slow-path: cache empty, do real allocation */ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } EXPORT_SYMBOL(page_pool_alloc_netmems); ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution */ #define _distance(a, b) (s32)((a) - (b)) s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); s32 inflight; inflight = _distance(hold_cnt, release_cnt); if (strict) { trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); } else { inflight = max(0, inflight); } return inflight; } void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { netmem_set_pp(netmem, pool); netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it * is allocated from the page allocator and page_pool_fragment_page() * is dirtying the same cache line as the page->pp_magic above, so * the overhead is negligible. */ page_pool_fragment_netmem(netmem, 1); if (pool->has_init_callback) pool->slow.init_callback(netmem, pool->slow.init_arg); } void page_pool_clear_pp_info(netmem_ref netmem) { netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { dma_addr_t dma; if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ return; dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need * to disconnect a page (from a page_pool), to allow it to be used as * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) { int count; bool put; put = true; if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) put = mp_dmabuf_devmem_release_page(pool, netmem); else __page_pool_release_page_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, netmem, count); if (put) { page_pool_clear_pp_info(netmem); put_page(netmem_to_page(netmem)); } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). */ } static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { int ret; /* BH protection not needed if current is softirq */ if (in_softirq()) ret = ptr_ring_produce(&pool->ring, (__force void *)netmem); else ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem); if (!ret) { recycle_stat_inc(pool, ring); return true; } return false; } /* Only allow direct recycling in special circumstances, into the * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. * * Caller must provide appropriate safe context. */ static bool page_pool_recycle_in_cache(netmem_ref netmem, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { recycle_stat_inc(pool, cache_full); return false; } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ pool->alloc.cache[pool->alloc.count++] = netmem; recycle_stat_inc(pool, cached); return true; } static bool __page_pool_page_can_be_recycled(netmem_ref netmem) { return netmem_is_net_iov(netmem) || (page_ref_count(netmem_to_page(netmem)) == 1 && !page_is_pfmemalloc(netmem_to_page(netmem))); } /* If the page refcnt == 1, this will try to recycle the page. * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ static __always_inline netmem_ref __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { lockdep_assert_no_hardirq(); /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. * * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ if (likely(__page_pool_page_can_be_recycled(netmem))) { /* Read barrier done in page_ref_count / READ_ONCE */ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) return 0; /* Page found as candidate for recycling */ return netmem; } /* Fallback/non-XDP mode: API user have elevated refcnt. * * Many drivers split up the page into fragments, and some * want to keep doing this to save memory and do refcnt based * recycling. Support this use case too, to ease drivers * switching between XDP/non-XDP. * * In-case page_pool maintains the DMA mapping, API user must * call page_pool_put_page once. In this elevated refcnt * case, the DMA is unmapped/released, as driver is likely * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); page_pool_return_page(pool, netmem); return 0; } static bool page_pool_napi_local(const struct page_pool *pool) { const struct napi_struct *napi; u32 cpuid; if (unlikely(!in_softirq())) return false; /* Allow direct recycle if we have reasons to believe that we are * in the same context as the consumer would run, so there's * no possible race. * __page_pool_put_page() makes sure we're not in hardirq context * and interrupts are enabled prior to accessing the cache. */ cpuid = smp_processor_id(); if (READ_ONCE(pool->cpuid) == cpuid) return true; napi = READ_ONCE(pool->p.napi); return napi && READ_ONCE(napi->list_owner) == cpuid; } void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { if (!allow_direct) allow_direct = page_pool_napi_local(pool); netmem = __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); page_pool_return_page(pool, netmem); } } EXPORT_SYMBOL(page_pool_put_unrefed_netmem); void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } EXPORT_SYMBOL(page_pool_put_unrefed_page); static void page_pool_recycle_ring_bulk(struct page_pool *pool, netmem_ref *bulk, u32 bulk_len) { bool in_softirq; u32 i; /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); for (i = 0; i < bulk_len; i++) { if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } page_pool_producer_unlock(pool, in_softirq); recycle_stat_add(pool, ring, i); /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; /* * ptr_ring cache is full, free remaining pages outside producer lock * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) page_pool_return_page(pool, bulk[i]); } /** * page_pool_put_netmem_bulk() - release references on multiple netmems * @data: array holding netmem references * @count: number of entries in @data * * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() * will release leftover netmems to the memory provider. * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx * completion loop for the XDP_REDIRECT use case. * * Please note the caller must not use data area after running * page_pool_put_netmem_bulk(), as this function overwrites it. */ void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { u32 bulk_len = 0; for (u32 i = 0; i < count; i++) { netmem_ref netmem = netmem_compound_head(data[i]); if (page_pool_unref_and_test(netmem)) data[bulk_len++] = netmem; } count = bulk_len; while (count) { netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; struct page_pool *pool = NULL; bool allow_direct; u32 foreign = 0; bulk_len = 0; for (u32 i = 0; i < count; i++) { struct page_pool *netmem_pp; netmem_ref netmem = data[i]; netmem_pp = netmem_get_pp(netmem); if (unlikely(!pool)) { pool = netmem_pp; allow_direct = page_pool_napi_local(pool); } else if (netmem_pp != pool) { /* * If the netmem belongs to a different * page_pool, save it for another round. */ data[foreign++] = netmem; continue; } netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (netmem) bulk[bulk_len++] = netmem; } if (bulk_len) page_pool_recycle_ring_bulk(pool, bulk, bulk_len); count = foreign; } } EXPORT_SYMBOL(page_pool_put_netmem_bulk); static netmem_ref page_pool_drain_frag(struct page_pool *pool, netmem_ref netmem) { long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ if (likely(page_pool_unref_netmem(netmem, drain_count))) return 0; if (__page_pool_page_can_be_recycled(netmem)) { page_pool_dma_sync_for_device(pool, netmem, -1); return netmem; } page_pool_return_page(pool, netmem); return 0; } static void page_pool_free_frag(struct page_pool *pool) { long drain_count = BIAS_MAX - pool->frag_users; netmem_ref netmem = pool->frag_page; pool->frag_page = 0; if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; page_pool_return_page(pool, netmem); } netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem = pool->frag_page; if (WARN_ON(size > max_size)) return 0; size = ALIGN(size, dma_get_cache_alignment()); *offset = pool->frag_offset; if (netmem && *offset + size > max_size) { netmem = page_pool_drain_frag(pool, netmem); if (netmem) { recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } } if (!netmem) { netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; } pool->frag_page = netmem; frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; page_pool_fragment_netmem(netmem, BIAS_MAX); return netmem; } pool->frag_users++; pool->frag_offset = *offset + size; return netmem; } EXPORT_SYMBOL(page_pool_alloc_frag_netmem); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, gfp)); } EXPORT_SYMBOL(page_pool_alloc_frag); static void page_pool_empty_ring(struct page_pool *pool) { netmem_ref netmem; /* Empty recycle ring */ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ if (!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", __func__, netmem_ref_count(netmem)); page_pool_return_page(pool, netmem); } } static void __page_pool_destroy(struct page_pool *pool) { if (pool->disconnect) pool->disconnect(pool); page_pool_unlist(pool); page_pool_uninit(pool); if (pool->mp_priv) { mp_dmabuf_devmem_destroy(pool); static_branch_dec(&page_pool_mem_providers); } kfree(pool); } static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { netmem_ref netmem; if (pool->destroy_cnt) return; /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_page(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { page_pool_empty_alloc_cache_once(pool); pool->destroy_cnt++; /* No more consumers should exist, but producers could still * be in-flight. */ page_pool_empty_ring(pool); } static int page_pool_release(struct page_pool *pool) { int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); if (!inflight) __page_pool_destroy(pool); return inflight; } static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); void *netdev; int inflight; inflight = page_pool_release(pool); if (!inflight) return; /* Periodic warning for page pools the user can't see */ netdev = READ_ONCE(pool->slow.netdev); if (time_after_eq(jiffies, pool->defer_warn) && (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } /* Still not ready to be disconnected, retry later */ schedule_delayed_work(&pool->release_dw, DEFER_TIME); } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); if (!pool->p.napi) return; /* To avoid races with recycling and additional barriers make sure * pool and NAPI are unlinked when NAPI is disabled. */ WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); void page_pool_destroy(struct page_pool *pool) { if (!pool) return; if (!page_pool_put(pool)) return; page_pool_disable_direct_recycling(pool); page_pool_free_frag(pool); if (!page_pool_release(pool)) return; page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); schedule_delayed_work(&pool->release_dw, DEFER_TIME); } EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { netmem_ref netmem; trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_page(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid);
3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 /* * llc_sap.c - driver routines for SAP component. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <net/llc.h> #include <net/llc_if.h> #include <net/llc_conn.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #include <net/llc_s_ac.h> #include <net/llc_s_ev.h> #include <net/llc_s_st.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/llc.h> #include <linux/slab.h> static int llc_mac_header_len(unsigned short devtype) { switch (devtype) { case ARPHRD_ETHER: case ARPHRD_LOOPBACK: return sizeof(struct ethhdr); } return 0; } /** * llc_alloc_frame - allocates sk_buff for frame * @sk: socket to allocate frame to * @dev: network device this skb will be sent over * @type: pdu type to allocate * @data_size: data size to allocate * * Allocates an sk_buff for frame and initializes sk_buff fields. * Returns allocated skb or %NULL when out of memory. */ struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev, u8 type, u32 data_size) { int hlen = type == LLC_PDU_TYPE_U ? 3 : 4; struct sk_buff *skb; hlen += llc_mac_header_len(dev->type); skb = alloc_skb(hlen + data_size, GFP_ATOMIC); if (skb) { skb_reset_mac_header(skb); skb_reserve(skb, hlen); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->protocol = htons(ETH_P_802_2); skb->dev = dev; if (sk != NULL) skb_set_owner_w(skb, sk); } return skb; } void llc_save_primitive(struct sock *sk, struct sk_buff *skb, u8 prim) { struct sockaddr_llc *addr; /* save primitive for use by the user. */ addr = llc_ui_skb_cb(skb); memset(addr, 0, sizeof(*addr)); addr->sllc_family = sk->sk_family; addr->sllc_arphrd = skb->dev->type; addr->sllc_test = prim == LLC_TEST_PRIM; addr->sllc_xid = prim == LLC_XID_PRIM; addr->sllc_ua = prim == LLC_DATAUNIT_PRIM; llc_pdu_decode_sa(skb, addr->sllc_mac); llc_pdu_decode_ssap(skb, &addr->sllc_sap); } /** * llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu. * @sap: pointer to SAP * @skb: received pdu */ void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); switch (LLC_U_PDU_RSP(pdu)) { case LLC_1_PDU_CMD_TEST: ev->prim = LLC_TEST_PRIM; break; case LLC_1_PDU_CMD_XID: ev->prim = LLC_XID_PRIM; break; case LLC_1_PDU_CMD_UI: ev->prim = LLC_DATAUNIT_PRIM; break; } ev->ind_cfm_flag = LLC_IND; } /** * llc_find_sap_trans - finds transition for event * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event. * Returns the pointer to found transition on success or %NULL for * failure. */ static const struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap, struct sk_buff *skb) { int i = 0; const struct llc_sap_state_trans *rc = NULL; const struct llc_sap_state_trans **next_trans; struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1]; /* * Search thru events for this state until list exhausted or until * its obvious the event is not valid for the current state */ for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) if (!next_trans[i]->ev(sap, skb)) { rc = next_trans[i]; /* got event match; return it */ break; } return rc; } /** * llc_exec_sap_trans_actions - execute actions related to event * @sap: pointer to SAP * @trans: pointer to transition that it's actions must be performed * @skb: happened event. * * This function executes actions that is related to happened event. * Returns 0 for success and 1 for failure of at least one action. */ static int llc_exec_sap_trans_actions(struct llc_sap *sap, const struct llc_sap_state_trans *trans, struct sk_buff *skb) { int rc = 0; const llc_sap_action_t *next_action = trans->ev_actions; for (; next_action && *next_action; next_action++) if ((*next_action)(sap, skb)) rc = 1; return rc; } /** * llc_sap_next_state - finds transition, execs actions & change SAP state * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event, then * executes related actions and finally changes state of SAP. It returns * 0 on success and 1 for failure. */ static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb) { const struct llc_sap_state_trans *trans; int rc = 1; if (sap->state > LLC_NR_SAP_STATES) goto out; trans = llc_find_sap_trans(sap, skb); if (!trans) goto out; /* * Got the state to which we next transition; perform the actions * associated with this transition before actually transitioning to the * next state */ rc = llc_exec_sap_trans_actions(sap, trans, skb); if (rc) goto out; /* * Transition SAP to next state if all actions execute successfully */ sap->state = trans->next_state; out: return rc; } /** * llc_sap_state_process - sends event to SAP state machine * @sap: sap to use * @skb: pointer to occurred event * * After executing actions of the event, upper layer will be indicated * if needed(on receiving an UI frame). sk can be null for the * datalink_proto case. * * This function always consumes a reference to the skb. */ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->ind_cfm_flag = 0; llc_sap_next_state(sap, skb); if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) { llc_save_primitive(skb->sk, skb, ev->prim); /* queue skb to the user. */ if (sock_queue_rcv_skb(skb->sk, skb) == 0) return; } kfree_skb(skb); } /** * llc_build_and_send_test_pkt - TEST interface for upper layers. * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a TEST pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_test_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_TEST_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_build_and_send_xid_pkt - XID interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a XID pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_XID_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_sap_rcv - sends received pdus to the sap state machine * @sap: current sap component structure. * @skb: received frame. * @sk: socket to associate to frame * * Sends received pdus to the sap state machine. */ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, struct sock *sk) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; skb_orphan(skb); sock_hold(sk); skb->sk = sk; skb->destructor = sock_efree; llc_sap_state_process(sap, skb); } static inline bool llc_dgram_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sock *sk, const struct net *net) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && net_eq(sock_net(sk), net) && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } /** * llc_lookup_dgram - Finds dgram socket for the local sap/mac * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * @net: netns to look up a socket in * * Search socket list of the SAP and finds connection using the local * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, const struct llc_addr *laddr, const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock_bh(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_dgram_match(sap, laddr, rc, net))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock_bh(); return rc; } static inline bool llc_mcast_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sk_buff *skb, const struct sock *sk) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && llc->laddr.lsap == laddr->lsap && llc->dev == skb->dev; } static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb, struct sock **stack, int count) { struct sk_buff *skb1; int i; for (i = 0; i < count; i++) { skb1 = skb_clone(skb, GFP_ATOMIC); if (!skb1) { sock_put(stack[i]); continue; } llc_sap_rcv(sap, skb1, stack[i]); sock_put(stack[i]); } } /** * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * @skb: PDU to deliver * * Search socket list of the SAP and finds connections with same sap. * Deliver clone to each. */ static void llc_sap_mcast(struct llc_sap *sap, const struct llc_addr *laddr, struct sk_buff *skb) { int i = 0; struct sock *sk; struct sock *stack[256 / sizeof(struct sock *)]; struct llc_sock *llc; struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); spin_lock_bh(&sap->sk_lock); hlist_for_each_entry(llc, dev_hb, dev_hash_node) { sk = &llc->sk; if (!llc_mcast_match(sap, laddr, skb, sk)) continue; sock_hold(sk); if (i < ARRAY_SIZE(stack)) stack[i++] = sk; else { llc_do_mcast(sap, skb, stack, i); i = 0; } } spin_unlock_bh(&sap->sk_lock); llc_do_mcast(sap, skb, stack, i); } void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr laddr; llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); if (is_multicast_ether_addr(laddr.mac)) { llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); } else { struct sock *sk = llc_lookup_dgram(sap, &laddr, dev_net(skb->dev)); if (sk) { llc_sap_rcv(sap, skb, sk); sock_put(sk); } else kfree_skb(skb); } }
16 13 16 16 6 16 16 13 13 16 6 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 // SPDX-License-Identifier: GPL-2.0-only #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H #include <asm/vmx.h> #include "mmu.h" #include "mmu_internal.h" /* * A MMU present SPTE is backed by actual memory and may or may not be present * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it * is ignored by all flavors of SPTEs and checking a low bit often generates * better code than for a high bit, e.g. 56+. MMU present checks are pervasive * enough that the improved code generation is noticeable in KVM's footprint. */ #define SPTE_MMU_PRESENT_MASK BIT_ULL(11) /* * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that * is must be employed for a given TDP SPTE. * * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE * paging, including NPT PAE. This scheme works because legacy shadow paging * is guaranteed to have A/D bits and write-protection is forced only for * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it * must be restricted to 64-bit KVM. */ #define SPTE_TDP_AD_SHIFT 52 #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) static_assert(SPTE_TDP_AD_ENABLED == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else #define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif #define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ #define SPTE_EPT_READABLE_MASK 0x1ull #define SPTE_EPT_EXECUTABLE_MASK 0x4ull #define SPTE_LEVEL_BITS 9 #define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) #define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) #define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the * PTEs being access tracked also need to be dirty tracked, so the W bit will be * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. */ #define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); /* * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given * SPTE is write-protected. See is_writable_pte() for details. */ /* Bits 9 and 10 are ignored by all non-EPT PTEs. */ #define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9) #define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10) /* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care * to not overlap the A/D type mask or the saved access bits of access-tracked * SPTEs when A/D bits are disabled. */ #define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) #define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); /* Defined only to keep the above static asserts readable. */ #undef SHADOW_ACC_TRACK_SAVED_MASK /* * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from * the "real" generation number and thus effectively halve the maximum number * of MMIO generations that can be handled before encountering a wrap (which * requires a full MMU zap). The flag is instead explicitly queried when * checking for MMIO spte cache hits. */ #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 10 #define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) static_assert(!(SPTE_MMU_PRESENT_MASK & (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); /* * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the * MMU-present bit. The generation obviously co-exists with the magic MMIO * mask/value, and MMIO SPTEs are considered !MMU-present. * * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO * and so they're off-limits for generation; additional checks ensure the mask * doesn't overlap legal PA bits), and bit 63 (carved out for future usage). */ #define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0)) static_assert(!(SPTE_MMIO_ALLOWED_MASK & (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) /* * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress * #VE and get EPT violations on non-present PTEs. We can use the * same value also without TDX for both VMX and SVM: * * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) */ #ifdef CONFIG_X86_64 #define SHADOW_NONPRESENT_VALUE BIT_ULL(63) static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); #else #define SHADOW_NONPRESENT_VALUE 0ULL #endif /* * True if A/D bits are supported in hardware and are enabled by KVM. When * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario * where KVM is using A/D bits for L1, but not L2. */ extern bool __read_mostly kvm_ad_enabled; extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; /* * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ extern u64 __read_mostly shadow_acc_track_mask; /* * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order * to guard against L1TF attacks. */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* * The number of high-order 1 bits to use in the mask above. */ #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 /* * If a thread running without exclusive control of the MMU lock must perform a * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF * vulnerability. * * Only used by the TDP MMU. */ #define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) /* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_frozen_spte(u64 spte) { return spte == FROZEN_SPTE; } /* Get an SPTE's index into its parent's page table (and the spt array). */ static inline int spte_index(u64 *sptep) { return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); } /* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask * left into the reserved bits, i.e. the GFN in the SPTE will be split into * high and low parts. This mask covers the lower bits of the GFN. */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); return (struct kvm_mmu_page *)page_private(page); } static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) { return to_shadow_page(spte & SPTE_BASE_ADDR_MASK); } static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) { return to_shadow_page(__pa(sptep)); } static inline struct kvm_mmu_page *root_to_sp(hpa_t root) { if (kvm_mmu_is_dummy_root(root)) return NULL; /* * The "root" may be a special root, e.g. a PAE entry, treat it as a * SPTE to ensure any non-PA bits are dropped. */ return spte_to_child_sp(root); } static inline bool is_mirror_sptep(tdp_ptep_t sptep) { return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep))); } static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) { return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && likely(enable_mmio_caching); } static inline bool is_shadow_present_pte(u64 pte) { return !!(pte & SPTE_MMU_PRESENT_MASK); } static inline bool is_ept_ve_possible(u64 spte) { return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && !(spte & VMX_EPT_SUPPRESS_VE_BIT) && (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; } static inline bool spte_ad_enabled(u64 spte) { KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); /* * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit * TDP and do the A/D type check unconditionally. */ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } static inline bool is_access_track_spte(u64 spte) { return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; } static inline bool is_last_spte(u64 pte, int level) { return (level == PG_LEVEL_4K) || is_large_pte(pte); } static inline bool is_executable_pte(u64 spte) { return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; } static inline kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) { return spte & shadow_accessed_mask; } static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { int bit7 = (pte >> 7) & 1; return rsvd_check->rsvd_bits_mask[bit7][level-1]; } static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { return pte & get_rsvd_bits(rsvd_check, pte, level); } static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) { return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); } static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, u64 spte, int level) { return __is_bad_mt_xwr(rsvd_check, spte) || __is_rsvd_bits_set(rsvd_check, spte, level); } /* * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: * * 1. To intercept writes for dirty logging. KVM write-protects huge pages * so that they can be split down into the dirty logging * granularity (4KiB) whenever the guest writes to them. KVM also * write-protects 4KiB pages so that writes can be recorded in the dirty log * (e.g. if not using PML). SPTEs are write-protected for dirty logging * during the VM-iotcls that enable dirty logging. * * 2. To intercept writes to guest page tables that KVM is shadowing. When a * guest writes to its page table the corresponding shadow page table will * be marked "unsync". That way KVM knows which shadow page tables need to * be updated on the next TLB flush, INVLPG, etc. and which do not. * * 3. To prevent guest writes to read-only memory, such as for memory in a * read-only memslot or guest memory backed by a read-only VMA. Writes to * such pages are disallowed entirely. * * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this * case, the SPTE is access-protected, not just write-protected! * * For cases #1 and #4, KVM can safely make such SPTEs writable without taking * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits * in the SPTE: * * shadow_mmu_writable_mask, aka MMU-writable - * Cleared on SPTEs that KVM is currently write-protecting for shadow paging * purposes (case 2 above). * * shadow_host_writable_mask, aka Host-writable - * Cleared on SPTEs that are not host-writable (case 3 above) * * Note, not all possible combinations of PT_WRITABLE_MASK, * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given * SPTE can be in only one of the following states, which map to the * aforementioned 3 cases: * * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK * ------------------------- | ------------------------ | ---------------- * 1 | 1 | 1 (writable) * 1 | 1 | 0 (case 1) * 1 | 0 | 0 (case 2) * 0 | 0 | 0 (case 3) * * The valid combinations of these bits are checked by * check_spte_writable_invariants() whenever an SPTE is modified. * * Clearing the MMU-writable bit is always done under the MMU lock and always * accompanied by a TLB flush before dropping the lock to avoid corrupting the * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging * (which does not clear the MMU-writable bit), does not flush TLBs before * dropping the lock, as it only needs to synchronize guest writes with the * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for * access-tracking via the clear_young() MMU notifier also does not flush TLBs. * * So, there is the problem: clearing the MMU-writable bit can encounter a * write-protected SPTE while CPUs still have writable mappings for that SPTE * cached in their TLB. To address this, KVM always flushes TLBs when * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE. * * The Host-writable bit is not modified on present SPTEs, it is only set or * cleared when an SPTE is first faulted in from non-present and then remains * immutable. */ static inline bool is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } /* Note: spte must be a shadow-present leaf SPTE. */ static inline void check_spte_writable_invariants(u64 spte) { if (spte & shadow_mmu_writable_mask) WARN_ONCE(!(spte & shadow_host_writable_mask), KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", spte); else WARN_ONCE(is_writable_pte(spte), KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); } static inline bool is_mmu_writable_spte(u64 spte) { return spte & shadow_mmu_writable_mask; } /* * Returns true if the access indicated by @fault is allowed by the existing * SPTE protections. Note, the caller is responsible for checking that the * SPTE is a shadow-present, leaf SPTE (either before or after). */ static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { if (fault->exec) return is_executable_pte(spte); if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ return spte & PT_PRESENT_MASK; } /* * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, * not whether or not SPTEs were modified, i.e. only the write-tracking case * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. * * Don't flush if the Accessed bit is cleared, as access tracking tolerates * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a * mmu_notifier.clear_flush_young() event. * * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and * when clearing dirty logs, KVM flushes based on whether or not dirty entries * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. * * Note, this logic only applies to shadow-present leaf SPTEs. The caller is * responsible for checking that the old SPTE is shadow-present, and is also * responsible for determining whether or not a TLB flush is required when * modifying a shadow-present non-leaf SPTE. */ static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) { return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); } static inline u64 get_mmio_spte_generation(u64 spte) { u64 gen; gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT; return gen; } bool spte_has_volatile_bits(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); u64 make_small_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index); u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); /* Restore an acc-track PTE back to a regular PTE */ static inline u64 restore_acc_track_spte(u64 spte) { u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) & SHADOW_ACC_TRACK_SAVED_BITS_MASK; spte &= ~shadow_acc_track_mask; spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); spte |= saved_bits; return spte; } void __init kvm_mmu_spte_module_init(void); void kvm_mmu_reset_all_pte_masks(void); #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 // SPDX-License-Identifier: GPL-2.0-only /* * This file contains functions assisting in mapping VFS to 9P2000 * * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/parser.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" static DEFINE_SPINLOCK(v9fs_sessionlist_lock); static LIST_HEAD(v9fs_sessionlist); struct kmem_cache *v9fs_inode_cache; /* * Option Parsing (code inspired by NFS code) * NOTE: each transport will parse its own options */ enum { /* Options that take integer arguments */ Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, /* String options */ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag, /* Options that take no arguments */ Opt_nodevmap, Opt_noxattr, Opt_directio, Opt_ignoreqv, /* Access options */ Opt_access, Opt_posixacl, /* Lock timeout option */ Opt_locktimeout, /* Error token */ Opt_err }; static const match_table_t tokens = { {Opt_debug, "debug=%x"}, {Opt_dfltuid, "dfltuid=%u"}, {Opt_dfltgid, "dfltgid=%u"}, {Opt_afid, "afid=%u"}, {Opt_uname, "uname=%s"}, {Opt_remotename, "aname=%s"}, {Opt_nodevmap, "nodevmap"}, {Opt_noxattr, "noxattr"}, {Opt_directio, "directio"}, {Opt_ignoreqv, "ignoreqv"}, {Opt_cache, "cache=%s"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, {Opt_locktimeout, "locktimeout=%u"}, {Opt_err, NULL} }; /* Interpret mount options for cache mode */ static int get_cache_mode(char *s) { int version = -EINVAL; if (!strcmp(s, "loose")) { version = CACHE_SC_LOOSE; p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { version = CACHE_SC_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "mmap")) { version = CACHE_SC_MMAP; p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); } else if (!strcmp(s, "readahead")) { version = CACHE_SC_READAHEAD; p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n"); } else if (!strcmp(s, "none")) { version = CACHE_SC_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); } else if (kstrtoint(s, 0, &version) != 0) { version = -EINVAL; pr_info("Unknown Cache mode or invalid value %s\n", s); } return version; } /* * Display the mount options in /proc/mounts. */ int v9fs_show_options(struct seq_file *m, struct dentry *root) { struct v9fs_session_info *v9ses = root->d_sb->s_fs_info; if (v9ses->debug) seq_printf(m, ",debug=%x", v9ses->debug); if (!uid_eq(v9ses->dfltuid, V9FS_DEFUID)) seq_printf(m, ",dfltuid=%u", from_kuid_munged(&init_user_ns, v9ses->dfltuid)); if (!gid_eq(v9ses->dfltgid, V9FS_DEFGID)) seq_printf(m, ",dfltgid=%u", from_kgid_munged(&init_user_ns, v9ses->dfltgid)); if (v9ses->afid != ~0) seq_printf(m, ",afid=%u", v9ses->afid); if (strcmp(v9ses->uname, V9FS_DEFUSER) != 0) seq_printf(m, ",uname=%s", v9ses->uname); if (strcmp(v9ses->aname, V9FS_DEFANAME) != 0) seq_printf(m, ",aname=%s", v9ses->aname); if (v9ses->nodev) seq_puts(m, ",nodevmap"); if (v9ses->cache) seq_printf(m, ",cache=%x", v9ses->cache); #ifdef CONFIG_9P_FSCACHE if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE)) seq_printf(m, ",cachetag=%s", v9ses->cachetag); #endif switch (v9ses->flags & V9FS_ACCESS_MASK) { case V9FS_ACCESS_USER: seq_puts(m, ",access=user"); break; case V9FS_ACCESS_ANY: seq_puts(m, ",access=any"); break; case V9FS_ACCESS_CLIENT: seq_puts(m, ",access=client"); break; case V9FS_ACCESS_SINGLE: seq_printf(m, ",access=%u", from_kuid_munged(&init_user_ns, v9ses->uid)); break; } if (v9ses->flags & V9FS_IGNORE_QV) seq_puts(m, ",ignoreqv"); if (v9ses->flags & V9FS_DIRECT_IO) seq_puts(m, ",directio"); if (v9ses->flags & V9FS_POSIX_ACL) seq_puts(m, ",posixacl"); if (v9ses->flags & V9FS_NO_XATTR) seq_puts(m, ",noxattr"); return p9_show_client_options(m, v9ses->clnt); } /** * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information * @opts: The mount option string * * Return 0 upon success, -ERRNO upon failure. */ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) { char *options, *tmp_options; substring_t args[MAX_OPT_ARGS]; char *p; int option = 0; char *s; int ret = 0; /* setup defaults */ v9ses->afid = ~0; v9ses->debug = 0; v9ses->cache = CACHE_NONE; #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif v9ses->session_lock_timeout = P9_LOCK_TIMEOUT; if (!opts) return 0; tmp_options = kstrdup(opts, GFP_KERNEL); if (!tmp_options) { ret = -ENOMEM; goto fail_option_alloc; } options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token, r; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_debug: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; } else { v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; #endif } break; case Opt_dfltuid: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } v9ses->dfltuid = make_kuid(current_user_ns(), option); if (!uid_valid(v9ses->dfltuid)) { p9_debug(P9_DEBUG_ERROR, "uid field, but not a uid?\n"); ret = -EINVAL; } break; case Opt_dfltgid: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } v9ses->dfltgid = make_kgid(current_user_ns(), option); if (!gid_valid(v9ses->dfltgid)) { p9_debug(P9_DEBUG_ERROR, "gid field, but not a gid?\n"); ret = -EINVAL; } break; case Opt_afid: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; } else { v9ses->afid = option; } break; case Opt_uname: kfree(v9ses->uname); v9ses->uname = match_strdup(&args[0]); if (!v9ses->uname) { ret = -ENOMEM; goto free_and_return; } break; case Opt_remotename: kfree(v9ses->aname); v9ses->aname = match_strdup(&args[0]); if (!v9ses->aname) { ret = -ENOMEM; goto free_and_return; } break; case Opt_nodevmap: v9ses->nodev = 1; break; case Opt_noxattr: v9ses->flags |= V9FS_NO_XATTR; break; case Opt_directio: v9ses->flags |= V9FS_DIRECT_IO; break; case Opt_ignoreqv: v9ses->flags |= V9FS_IGNORE_QV; break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); v9ses->cachetag = match_strdup(&args[0]); if (!v9ses->cachetag) { ret = -ENOMEM; goto free_and_return; } #endif break; case Opt_cache: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of cache arg\n"); goto free_and_return; } r = get_cache_mode(s); if (r < 0) ret = r; else v9ses->cache = r; kfree(s); break; case Opt_access: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of access arg\n"); goto free_and_return; } v9ses->flags &= ~V9FS_ACCESS_MASK; if (strcmp(s, "user") == 0) v9ses->flags |= V9FS_ACCESS_USER; else if (strcmp(s, "any") == 0) v9ses->flags |= V9FS_ACCESS_ANY; else if (strcmp(s, "client") == 0) { v9ses->flags |= V9FS_ACCESS_CLIENT; } else { uid_t uid; v9ses->flags |= V9FS_ACCESS_SINGLE; r = kstrtouint(s, 10, &uid); if (r) { ret = r; pr_info("Unknown access argument %s: %d\n", s, r); kfree(s); continue; } v9ses->uid = make_kuid(current_user_ns(), uid); if (!uid_valid(v9ses->uid)) { ret = -EINVAL; pr_info("Unknown uid %s\n", s); } } kfree(s); break; case Opt_posixacl: #ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_POSIX_ACL; #else p9_debug(P9_DEBUG_ERROR, "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); #endif break; case Opt_locktimeout: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } if (option < 1) { p9_debug(P9_DEBUG_ERROR, "locktimeout must be a greater than zero integer.\n"); ret = -EINVAL; continue; } v9ses->session_lock_timeout = (long)option * HZ; break; default: continue; } } free_and_return: kfree(tmp_options); fail_option_alloc: return ret; } /** * v9fs_session_init - initialize session * @v9ses: session information structure * @dev_name: device being mounted * @data: options * */ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { struct p9_fid *fid; int rc = -ENOMEM; v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!v9ses->uname) goto err_names; v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); if (!v9ses->aname) goto err_names; init_rwsem(&v9ses->rename_sem); v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); goto err_names; } v9ses->flags = V9FS_ACCESS_USER; if (p9_is_proto_dotl(v9ses->clnt)) { v9ses->flags = V9FS_ACCESS_CLIENT; v9ses->flags |= V9FS_PROTO_2000L; } else if (p9_is_proto_dotu(v9ses->clnt)) { v9ses->flags |= V9FS_PROTO_2000U; } rc = v9fs_parse_options(v9ses, data); if (rc < 0) goto err_clnt; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; if (!v9fs_proto_dotl(v9ses) && ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { /* * We support ACCESS_CLIENT only for dotl. * Fall back to ACCESS_USER */ v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_USER; } /*FIXME !! */ /* for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_ANY; v9ses->uid = INVALID_UID; } if (!v9fs_proto_dotl(v9ses) || !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { /* * We support ACL checks on clinet only if the protocol is * 9P2000.L and access is V9FS_ACCESS_CLIENT. */ v9ses->flags &= ~V9FS_ACL_MASK; } fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { rc = PTR_ERR(fid); p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); goto err_clnt; } if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) fid->uid = v9ses->uid; else fid->uid = INVALID_UID; #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ if (v9ses->cache & CACHE_FSCACHE) { rc = v9fs_cache_session_get_cookie(v9ses, dev_name); if (rc < 0) goto err_clnt; } #endif spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); return fid; err_clnt: #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); #endif p9_client_destroy(v9ses->clnt); err_names: kfree(v9ses->uname); kfree(v9ses->aname); return ERR_PTR(rc); } /** * v9fs_session_close - shutdown a session * @v9ses: session information structure * */ void v9fs_session_close(struct v9fs_session_info *v9ses) { if (v9ses->clnt) { p9_client_destroy(v9ses->clnt); v9ses->clnt = NULL; } #ifdef CONFIG_9P_FSCACHE fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false); kfree(v9ses->cachetag); #endif kfree(v9ses->uname); kfree(v9ses->aname); spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); } /** * v9fs_session_cancel - terminate a session * @v9ses: session to terminate * * mark transport as disconnected and cancel all pending requests. */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } /** * v9fs_session_begin_cancel - Begin terminate of a session * @v9ses: session to terminate * * After this call we don't allow any request other than clunk. */ void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); p9_client_begin_disconnect(v9ses->clnt); } static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE /* * List caches associated with a session */ static ssize_t caches_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t n = 0, count = 0, limit = PAGE_SIZE; struct v9fs_session_info *v9ses; spin_lock(&v9fs_sessionlist_lock); list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { if (v9ses->cachetag) { n = snprintf(buf, limit, "%s\n", v9ses->cachetag); if (n < 0) { count = n; break; } count += n; limit -= n; } } spin_unlock(&v9fs_sessionlist_lock); return count; } static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches); #endif /* CONFIG_9P_FSCACHE */ static struct attribute *v9fs_attrs[] = { #ifdef CONFIG_9P_FSCACHE &v9fs_attr_cache.attr, #endif NULL, }; static const struct attribute_group v9fs_attr_group = { .attrs = v9fs_attrs, }; /** * v9fs_sysfs_init - Initialize the v9fs sysfs interface * */ static int __init v9fs_sysfs_init(void) { v9fs_kobj = kobject_create_and_add("9p", fs_kobj); if (!v9fs_kobj) return -ENOMEM; if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) { kobject_put(v9fs_kobj); return -ENOMEM; } return 0; } /** * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface * */ static void v9fs_sysfs_cleanup(void) { sysfs_remove_group(v9fs_kobj, &v9fs_attr_group); kobject_put(v9fs_kobj); } static void v9fs_inode_init_once(void *foo) { struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; memset(&v9inode->qid, 0, sizeof(v9inode->qid)); inode_init_once(&v9inode->netfs.inode); } /** * v9fs_init_inode_cache - initialize a cache for 9P * Returns 0 on success. */ static int v9fs_init_inode_cache(void) { v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; return 0; } /** * v9fs_destroy_inode_cache - destroy the cache of 9P inode * */ static void v9fs_destroy_inode_cache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(v9fs_inode_cache); } /** * init_v9fs - Initialize module * */ static int __init init_v9fs(void) { int err; pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ err = v9fs_init_inode_cache(); if (err < 0) { pr_err("Failed to register v9fs for caching\n"); return err; } err = v9fs_sysfs_init(); if (err < 0) { pr_err("Failed to register with sysfs\n"); goto out_cache; } err = register_filesystem(&v9fs_fs_type); if (err < 0) { pr_err("Failed to register filesystem\n"); goto out_sysfs_cleanup; } return 0; out_sysfs_cleanup: v9fs_sysfs_cleanup(); out_cache: v9fs_destroy_inode_cache(); return err; } /** * exit_v9fs - shutdown module * */ static void __exit exit_v9fs(void) { v9fs_sysfs_cleanup(); v9fs_destroy_inode_cache(); unregister_filesystem(&v9fs_fs_type); } module_init(init_v9fs) module_exit(exit_v9fs) MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); MODULE_DESCRIPTION("9P Client File System"); MODULE_LICENSE("GPL");
6 6 6 5 6 5 4 4 4 5 6 5 5 6 6 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 // SPDX-License-Identifier: GPL-2.0-or-later /************************************************************ * EFI GUID Partition Table handling * * http://www.uefi.org/specs/ * http://www.intel.com/technology/efi/ * * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> * Copyright 2000,2001,2002,2004 Dell Inc. * * TODO: * * Changelog: * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com> * - detect hybrid MBRs, tighter pMBR checking & cleanups. * * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. * - check for first/last_usable_lba outside of size of disk * * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com> * - Ported to 2.5.7-pre1 and 2.5.7-dj2 * - Applied patch to avoid fault in alternate header handling * - cleaned up find_valid_gpt * - On-disk structure and copy in memory is *always* LE now - * swab fields as needed * - remove print_gpt_header() * - only use first max_p partition entries, to keep the kernel minor number * and partition numbers tied. * * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com> * - Removed __PRIPTR_PREFIX - not being used * * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com> * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied * * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com> * - Added compare_gpts(). * - moved le_efi_guid_to_cpus() back into this file. GPT is the only * thing that keeps EFI GUIDs on disk. * - Changed gpt structure names and members to be simpler and more Linux-like. * * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com> * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck * * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com> * - Changed function comments to DocBook style per Andreas Dilger suggestion. * * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com> * - Change read_lba() to use the page cache per Al Viro's work. * - print u64s properly on all architectures * - fixed debug_printk(), now Dprintk() * * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com> * - Style cleanups * - made most functions static * - Endianness addition * - remove test for second alternate header, as it's not per spec, * and is unnecessary. There's now a method to read/write the last * sector of an odd-sized disk from user space. No tools have ever * been released which used this code, so it's effectively dead. * - Per Asit Mallick of Intel, added a test for a valid PMBR. * - Added kernel command line option 'gpt' to override valid PMBR test. * * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com> * - added devfs volume UUID support (/dev/volumes/uuids) for * mounting file systems by the partition GUID. * * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com> * - Moved crc32() to linux/lib, added efi_crc32(). * * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com> * - Replaced Intel's CRC32 function with an equivalent * non-license-restricted version. * * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com> * - Fixed the last_lba() call to return the proper last block * * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com> * - Thanks to Andries Brouwer for his debugging assistance. * - Code works, detects all the partitions. * ************************************************************/ #include <linux/kernel.h> #include <linux/crc32.h> #include <linux/ctype.h> #include <linux/math64.h> #include <linux/slab.h> #include "check.h" #include "efi.h" /* This allows a kernel command line option 'gpt' to override * the test for invalid PMBR. Not __initdata because reloading * the partition tables happens after init too. */ static int force_gpt; static int __init force_gpt_fn(char *str) { force_gpt = 1; return 1; } __setup("gpt", force_gpt_fn); /** * efi_crc32() - EFI version of crc32 function * @buf: buffer to calculate crc32 of * @len: length of buf * * Description: Returns EFI-style CRC32 value for @buf * * This function uses the little endian Ethernet polynomial * but seeds the function with ~0, and xor's with ~0 at the end. * Note, the EFI Specification, v1.02, has a reference to * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). */ static inline u32 efi_crc32(const void *buf, unsigned long len) { return (crc32(~0L, buf, len) ^ ~0L); } /** * last_lba(): return number of last logical block of device * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ static u64 last_lba(struct gendisk *disk) { return div_u64(bdev_nr_bytes(disk->part0), queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) { if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) goto invalid; /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; return GPT_MBR_PROTECTIVE; invalid: return 0; } /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure * @total_sectors: amount of sectors in the device * * Description: Checks for a valid protective or hybrid * master boot record (MBR). The validity of a pMBR depends * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found * * In addition, a hybrid MBR will have up to three additional * primary partitions, which point to the same space that's * marked out by up to three GPT partitions. * * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or * GPT_MBR_HYBRID depending on the device layout. */ static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { uint32_t sz = 0; int i, part = 0, ret = 0; /* invalid by default */ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) goto done; for (i = 0; i < 4; i++) { ret = pmbr_part_valid(&mbr->partition_record[i]); if (ret == GPT_MBR_PROTECTIVE) { part = i; /* * Ok, we at least know that there's a protective MBR, * now check if there are other partition types for * hybrid MBR. */ goto check_hybrid; } } if (ret != GPT_MBR_PROTECTIVE) goto done; check_hybrid: for (i = 0; i < 4; i++) if ((mbr->partition_record[i].os_type != EFI_PMBR_OSTYPE_EFI_GPT) && (mbr->partition_record[i].os_type != 0x00)) ret = GPT_MBR_HYBRID; /* * Protective MBRs take up the lesser of the whole disk * or 2 TiB (32bit LBA), ignoring the rest of the disk. * Some partitioning programs, nonetheless, choose to set * the size to the maximum 32-bit limitation, disregarding * the disk size. * * Hybrid MBRs do not necessarily comply with this. * * Consider a bad value here to be a warning to support dd'ing * an image from a smaller disk to a larger disk. */ if (ret == GPT_MBR_PROTECTIVE) { sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", sz, min_t(uint32_t, total_sectors - 1, 0xFFFFFFFF)); } done: return ret; } /** * read_lba(): Read bytes from disk, starting at given LBA * @state: disk parsed partitions * @lba: the Logical Block Address of the partition table * @buffer: destination buffer * @count: bytes to read * * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; sector_t n = lba * (queue_logical_block_size(state->disk->queue) / 512); if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { int copied = 512; Sector sect; unsigned char *data = read_part_sector(state, n++, &sect); if (!data) break; if (copied > count) copied = count; memcpy(buffer, data, copied); put_dev_sector(sect); buffer += copied; totalreadcount +=copied; count -= copied; } return totalreadcount; } /** * alloc_read_gpt_entries(): reads partition entries from disk * @state: disk parsed partitions * @gpt: GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. * Notes: remember to free pte when you're done! */ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, gpt_header *gpt) { size_t count; gpt_entry *pte; if (!gpt) return NULL; count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; pte = kmalloc(count, GFP_KERNEL); if (!pte) return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; } return pte; } /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk * @state: disk parsed partitions * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. */ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; } return gpt; } /** * is_gpt_valid() - tests one GPT header and PTEs for validity * @state: disk parsed partitions * @lba: logical block address of the GPT header to test * @gpt: GPT header ptr, filled on return. * @ptes: PTEs ptr, filled on return. * * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. */ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; u64 lastlba, pt_size; if (!ptes) return 0; if (!(*gpt = alloc_read_gpt_header(state, lba))) return 0; /* Check the GUID Partition Table signature */ if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { pr_debug("GUID Partition Table Header signature is wrong:" "%lld != %lld\n", (unsigned long long)le64_to_cpu((*gpt)->signature), (unsigned long long)GPT_HEADER_SIGNATURE); goto fail; } /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), queue_logical_block_size(state->disk->queue)); goto fail; } /* Check the GUID Partition Table header size is too small */ if (le32_to_cpu((*gpt)->header_size) < sizeof(gpt_header)) { pr_debug("GUID Partition Table Header size is too small: %u < %zu\n", le32_to_cpu((*gpt)->header_size), sizeof(gpt_header)); goto fail; } /* Check the GUID Partition Table CRC */ origcrc = le32_to_cpu((*gpt)->header_crc32); (*gpt)->header_crc32 = 0; crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); if (crc != origcrc) { pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", crc, origcrc); goto fail; } (*gpt)->header_crc32 = cpu_to_le32(origcrc); /* Check that the my_lba entry points to the LBA that contains * the GUID Partition Table */ if (le64_to_cpu((*gpt)->my_lba) != lba) { pr_debug("GPT my_lba incorrect: %lld != %lld\n", (unsigned long long)le64_to_cpu((*gpt)->my_lba), (unsigned long long)lba); goto fail; } /* Check the first_usable_lba and last_usable_lba are * within the disk. */ lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); goto fail; } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partition Entry Size check failed.\n"); goto fail; } /* Sanity check partition table size */ pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * le32_to_cpu((*gpt)->sizeof_partition_entry); if (pt_size > KMALLOC_MAX_SIZE) { pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", (unsigned long long)pt_size, KMALLOC_MAX_SIZE); goto fail; } if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); goto fail_ptes; } /* We're done, all's well */ return 1; fail_ptes: kfree(*ptes); *ptes = NULL; fail: kfree(*gpt); *gpt = NULL; return 0; } /** * is_pte_valid() - tests one PTE for validity * @pte:pte to check * @lastlba: last lba of the disk * * Description: returns 1 if valid, 0 on error. */ static inline int is_pte_valid(const gpt_entry *pte, const u64 lastlba) { if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || le64_to_cpu(pte->starting_lba) > lastlba || le64_to_cpu(pte->ending_lba) > lastlba) return 0; return 1; } /** * compare_gpts() - Search disk for valid GPT headers and PTEs * @pgpt: primary GPT header * @agpt: alternate GPT header * @lastlba: last LBA number * * Description: Returns nothing. Sanity checks pgpt and agpt fields * and prints warnings on discrepancies. * */ static void compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) { int error_found = 0; if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { pr_warn("GPT:first_usable_lbas don't match.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { pr_warn("GPT:last_usable_lbas don't match.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); error_found++; } if (le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); error_found++; } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs * @state: disk parsed partitions * @gpt: GPT header ptr, filled on return. * @ptes: PTEs ptr, filled on return. * * Description: Returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. * Validity depends on PMBR being valid (or being overridden by the * 'gpt' kernel command line option) and finding either the Primary * GPT header and PTEs valid, or the Alternate GPT header and PTEs * valid. If the Primary GPT header is not valid, the Alternate GPT header * is not checked unless the 'gpt' kernel command line option is passed. * This protects against devices which misreport their size, and forces * the user to decide to use the Alternate GPT. */ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_entry **ptes) { int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; struct gendisk *disk = state->disk; const struct block_device_operations *fops = disk->fops; sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); if (!legacymbr) goto fail; read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr, total_sectors); kfree(legacymbr); if (!good_pmbr) goto fail; pr_debug("Device has a %s MBR\n", good_pmbr == GPT_MBR_PROTECTIVE ? "protective" : "hybrid"); } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); if (good_pgpt) good_agpt = is_gpt_valid(state, le64_to_cpu(pgpt->alternate_lba), &agpt, &aptes); if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { sector_t agpt_sector; int err; err = fops->alternative_gpt_sector(disk, &agpt_sector); if (!err) good_agpt = is_gpt_valid(state, agpt_sector, &agpt, &aptes); } /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; compare_gpts(pgpt, agpt, lastlba); /* The good cases */ if (good_pgpt) { *gpt = pgpt; *ptes = pptes; kfree(agpt); kfree(aptes); if (!good_agpt) pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { *gpt = agpt; *ptes = aptes; kfree(pgpt); kfree(pptes); pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } fail: kfree(pgpt); kfree(agpt); kfree(pptes); kfree(aptes); *gpt = NULL; *ptes = NULL; return 0; } /** * utf16_le_to_7bit(): Naively converts a UTF-16LE string to 7-bit ASCII characters * @in: input UTF-16LE string * @size: size of the input string * @out: output string ptr, should be capable to store @size+1 characters * * Description: Converts @size UTF16-LE symbols from @in string to 7-bit * ASCII characters and stores them to @out. Adds trailing zero to @out array. */ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) { unsigned int i = 0; out[size] = 0; while (i < size) { u8 c = le16_to_cpu(in[i]) & 0x7f; if (c && !isprint(c)) c = '!'; out[i] = c; i++; } } /** * efi_partition - scan for GPT partitions * @state: disk parsed partitions * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. * * If the first block on the disk is a legacy MBR, * it will get handled by msdos_partition(). * If it's a Protective MBR, we'll handle it here. * * We do not create a Linux partition for GPT, but * only for the actual data partitions. * Returns: * -1 if unable to read the partition table * 0 if this isn't our partition table * 1 if successful * */ int efi_partition(struct parsed_partitions *state) { gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); kfree(ptes); return 0; } pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { struct partition_meta_info *info; unsigned label_max; u64 start = le64_to_cpu(ptes[i].starting_lba); u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; efi_guid_to_str(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ label_max = min(ARRAY_SIZE(info->volname) - 1, ARRAY_SIZE(ptes[i].partition_name)); utf16_le_to_7bit(ptes[i].partition_name, label_max, info->volname); state->parts[i + 1].has_info = true; } kfree(ptes); kfree(gpt); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
12 12 12 2 10 12 5 5 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 // SPDX-License-Identifier: GPL-2.0-or-later /* * V4L2 controls framework control definitions. * * Copyright (C) 2010-2021 Hans Verkuil <hverkuil-cisco@xs4all.nl> */ #include <linux/export.h> #include <media/v4l2-ctrls.h> /* * Returns NULL or a character pointer array containing the menu for * the given control ID. The pointer array ends with a NULL pointer. * An empty string signifies a menu entry that is invalid. This allows * drivers to disable certain options if it is not supported. */ const char * const *v4l2_ctrl_get_menu(u32 id) { static const char * const mpeg_audio_sampling_freq[] = { "44.1 kHz", "48 kHz", "32 kHz", NULL }; static const char * const mpeg_audio_encoding[] = { "MPEG-1/2 Layer I", "MPEG-1/2 Layer II", "MPEG-1/2 Layer III", "MPEG-2/4 AAC", "AC-3", NULL }; static const char * const mpeg_audio_l1_bitrate[] = { "32 kbps", "64 kbps", "96 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "288 kbps", "320 kbps", "352 kbps", "384 kbps", "416 kbps", "448 kbps", NULL }; static const char * const mpeg_audio_l2_bitrate[] = { "32 kbps", "48 kbps", "56 kbps", "64 kbps", "80 kbps", "96 kbps", "112 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "320 kbps", "384 kbps", NULL }; static const char * const mpeg_audio_l3_bitrate[] = { "32 kbps", "40 kbps", "48 kbps", "56 kbps", "64 kbps", "80 kbps", "96 kbps", "112 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "320 kbps", NULL }; static const char * const mpeg_audio_ac3_bitrate[] = { "32 kbps", "40 kbps", "48 kbps", "56 kbps", "64 kbps", "80 kbps", "96 kbps", "112 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "320 kbps", "384 kbps", "448 kbps", "512 kbps", "576 kbps", "640 kbps", NULL }; static const char * const mpeg_audio_mode[] = { "Stereo", "Joint Stereo", "Dual", "Mono", NULL }; static const char * const mpeg_audio_mode_extension[] = { "Bound 4", "Bound 8", "Bound 12", "Bound 16", NULL }; static const char * const mpeg_audio_emphasis[] = { "No Emphasis", "50/15 us", "CCITT J17", NULL }; static const char * const mpeg_audio_crc[] = { "No CRC", "16-bit CRC", NULL }; static const char * const mpeg_audio_dec_playback[] = { "Auto", "Stereo", "Left", "Right", "Mono", "Swapped Stereo", NULL }; static const char * const mpeg_video_encoding[] = { "MPEG-1", "MPEG-2", "MPEG-4 AVC", NULL }; static const char * const mpeg_video_aspect[] = { "1x1", "4x3", "16x9", "2.21x1", NULL }; static const char * const mpeg_video_bitrate_mode[] = { "Variable Bitrate", "Constant Bitrate", "Constant Quality", NULL }; static const char * const mpeg_stream_type[] = { "MPEG-2 Program Stream", "MPEG-2 Transport Stream", "MPEG-1 System Stream", "MPEG-2 DVD-compatible Stream", "MPEG-1 VCD-compatible Stream", "MPEG-2 SVCD-compatible Stream", NULL }; static const char * const mpeg_stream_vbi_fmt[] = { "No VBI", "Private Packet, IVTV Format", NULL }; static const char * const camera_power_line_frequency[] = { "Disabled", "50 Hz", "60 Hz", "Auto", NULL }; static const char * const camera_exposure_auto[] = { "Auto Mode", "Manual Mode", "Shutter Priority Mode", "Aperture Priority Mode", NULL }; static const char * const camera_exposure_metering[] = { "Average", "Center Weighted", "Spot", "Matrix", NULL }; static const char * const camera_auto_focus_range[] = { "Auto", "Normal", "Macro", "Infinity", NULL }; static const char * const colorfx[] = { "None", "Black & White", "Sepia", "Negative", "Emboss", "Sketch", "Sky Blue", "Grass Green", "Skin Whiten", "Vivid", "Aqua", "Art Freeze", "Silhouette", "Solarization", "Antique", "Set Cb/Cr", NULL }; static const char * const auto_n_preset_white_balance[] = { "Manual", "Auto", "Incandescent", "Fluorescent", "Fluorescent H", "Horizon", "Daylight", "Flash", "Cloudy", "Shade", NULL, }; static const char * const camera_iso_sensitivity_auto[] = { "Manual", "Auto", NULL }; static const char * const scene_mode[] = { "None", "Backlight", "Beach/Snow", "Candle Light", "Dusk/Dawn", "Fall Colors", "Fireworks", "Landscape", "Night", "Party/Indoor", "Portrait", "Sports", "Sunset", "Text", NULL }; static const char * const tune_emphasis[] = { "None", "50 Microseconds", "75 Microseconds", NULL, }; static const char * const header_mode[] = { "Separate Buffer", "Joined With 1st Frame", NULL, }; static const char * const multi_slice[] = { "Single", "Max Macroblocks", "Max Bytes", NULL, }; static const char * const entropy_mode[] = { "CAVLC", "CABAC", NULL, }; static const char * const mpeg_h264_level[] = { "1", "1b", "1.1", "1.2", "1.3", "2", "2.1", "2.2", "3", "3.1", "3.2", "4", "4.1", "4.2", "5", "5.1", "5.2", "6.0", "6.1", "6.2", NULL, }; static const char * const h264_loop_filter[] = { "Enabled", "Disabled", "Disabled at Slice Boundary", NULL, }; static const char * const h264_profile[] = { "Baseline", "Constrained Baseline", "Main", "Extended", "High", "High 10", "High 422", "High 444 Predictive", "High 10 Intra", "High 422 Intra", "High 444 Intra", "CAVLC 444 Intra", "Scalable Baseline", "Scalable High", "Scalable High Intra", "Stereo High", "Multiview High", "Constrained High", NULL, }; static const char * const vui_sar_idc[] = { "Unspecified", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11", "32:11", "80:33", "18:11", "15:11", "64:33", "160:99", "4:3", "3:2", "2:1", "Extended SAR", NULL, }; static const char * const h264_fp_arrangement_type[] = { "Checkerboard", "Column", "Row", "Side by Side", "Top Bottom", "Temporal", NULL, }; static const char * const h264_fmo_map_type[] = { "Interleaved Slices", "Scattered Slices", "Foreground with Leftover", "Box Out", "Raster Scan", "Wipe Scan", "Explicit", NULL, }; static const char * const h264_decode_mode[] = { "Slice-Based", "Frame-Based", NULL, }; static const char * const h264_start_code[] = { "No Start Code", "Annex B Start Code", NULL, }; static const char * const h264_hierarchical_coding_type[] = { "Hier Coding B", "Hier Coding P", NULL, }; static const char * const mpeg_mpeg2_level[] = { "Low", "Main", "High 1440", "High", NULL, }; static const char * const mpeg2_profile[] = { "Simple", "Main", "SNR Scalable", "Spatially Scalable", "High", NULL, }; static const char * const mpeg_mpeg4_level[] = { "0", "0b", "1", "2", "3", "3b", "4", "5", NULL, }; static const char * const mpeg4_profile[] = { "Simple", "Advanced Simple", "Core", "Simple Scalable", "Advanced Coding Efficiency", NULL, }; static const char * const vpx_golden_frame_sel[] = { "Use Previous Frame", "Use Previous Specific Frame", NULL, }; static const char * const vp8_profile[] = { "0", "1", "2", "3", NULL, }; static const char * const vp9_profile[] = { "0", "1", "2", "3", NULL, }; static const char * const vp9_level[] = { "1", "1.1", "2", "2.1", "3", "3.1", "4", "4.1", "5", "5.1", "5.2", "6", "6.1", "6.2", NULL, }; static const char * const flash_led_mode[] = { "Off", "Flash", "Torch", NULL, }; static const char * const flash_strobe_source[] = { "Software", "External", NULL, }; static const char * const jpeg_chroma_subsampling[] = { "4:4:4", "4:2:2", "4:2:0", "4:1:1", "4:1:0", "Gray", NULL, }; static const char * const dv_tx_mode[] = { "DVI-D", "HDMI", NULL, }; static const char * const dv_rgb_range[] = { "Automatic", "RGB Limited Range (16-235)", "RGB Full Range (0-255)", NULL, }; static const char * const dv_it_content_type[] = { "Graphics", "Photo", "Cinema", "Game", "No IT Content", NULL, }; static const char * const detect_md_mode[] = { "Disabled", "Global", "Threshold Grid", "Region Grid", NULL, }; static const char * const av1_profile[] = { "Main", "High", "Professional", NULL, }; static const char * const av1_level[] = { "2.0", "2.1", "2.2", "2.3", "3.0", "3.1", "3.2", "3.3", "4.0", "4.1", "4.2", "4.3", "5.0", "5.1", "5.2", "5.3", "6.0", "6.1", "6.2", "6.3", "7.0", "7.1", "7.2", "7.3", NULL, }; static const char * const hevc_profile[] = { "Main", "Main Still Picture", "Main 10", NULL, }; static const char * const hevc_level[] = { "1", "2", "2.1", "3", "3.1", "4", "4.1", "5", "5.1", "5.2", "6", "6.1", "6.2", NULL, }; static const char * const hevc_hierarchial_coding_type[] = { "B", "P", NULL, }; static const char * const hevc_refresh_type[] = { "None", "CRA", "IDR", NULL, }; static const char * const hevc_size_of_length_field[] = { "0", "1", "2", "4", NULL, }; static const char * const hevc_tier[] = { "Main", "High", NULL, }; static const char * const hevc_loop_filter_mode[] = { "Disabled", "Enabled", "Disabled at slice boundary", "NULL", }; static const char * const hevc_decode_mode[] = { "Slice-Based", "Frame-Based", NULL, }; static const char * const hevc_start_code[] = { "No Start Code", "Annex B Start Code", NULL, }; static const char * const camera_orientation[] = { "Front", "Back", "External", NULL, }; static const char * const mpeg_video_frame_skip[] = { "Disabled", "Level Limit", "VBV/CPB Limit", NULL, }; static const char * const intra_refresh_period_type[] = { "Random", "Cyclic", NULL, }; switch (id) { case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: return mpeg_audio_sampling_freq; case V4L2_CID_MPEG_AUDIO_ENCODING: return mpeg_audio_encoding; case V4L2_CID_MPEG_AUDIO_L1_BITRATE: return mpeg_audio_l1_bitrate; case V4L2_CID_MPEG_AUDIO_L2_BITRATE: return mpeg_audio_l2_bitrate; case V4L2_CID_MPEG_AUDIO_L3_BITRATE: return mpeg_audio_l3_bitrate; case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: return mpeg_audio_ac3_bitrate; case V4L2_CID_MPEG_AUDIO_MODE: return mpeg_audio_mode; case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: return mpeg_audio_mode_extension; case V4L2_CID_MPEG_AUDIO_EMPHASIS: return mpeg_audio_emphasis; case V4L2_CID_MPEG_AUDIO_CRC: return mpeg_audio_crc; case V4L2_CID_MPEG_AUDIO_DEC_PLAYBACK: case V4L2_CID_MPEG_AUDIO_DEC_MULTILINGUAL_PLAYBACK: return mpeg_audio_dec_playback; case V4L2_CID_MPEG_VIDEO_ENCODING: return mpeg_video_encoding; case V4L2_CID_MPEG_VIDEO_ASPECT: return mpeg_video_aspect; case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: return mpeg_video_bitrate_mode; case V4L2_CID_MPEG_STREAM_TYPE: return mpeg_stream_type; case V4L2_CID_MPEG_STREAM_VBI_FMT: return mpeg_stream_vbi_fmt; case V4L2_CID_POWER_LINE_FREQUENCY: return camera_power_line_frequency; case V4L2_CID_EXPOSURE_AUTO: return camera_exposure_auto; case V4L2_CID_EXPOSURE_METERING: return camera_exposure_metering; case V4L2_CID_AUTO_FOCUS_RANGE: return camera_auto_focus_range; case V4L2_CID_COLORFX: return colorfx; case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: return auto_n_preset_white_balance; case V4L2_CID_ISO_SENSITIVITY_AUTO: return camera_iso_sensitivity_auto; case V4L2_CID_SCENE_MODE: return scene_mode; case V4L2_CID_TUNE_PREEMPHASIS: return tune_emphasis; case V4L2_CID_TUNE_DEEMPHASIS: return tune_emphasis; case V4L2_CID_FLASH_LED_MODE: return flash_led_mode; case V4L2_CID_FLASH_STROBE_SOURCE: return flash_strobe_source; case V4L2_CID_MPEG_VIDEO_HEADER_MODE: return header_mode; case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: return mpeg_video_frame_skip; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: return multi_slice; case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: return entropy_mode; case V4L2_CID_MPEG_VIDEO_H264_LEVEL: return mpeg_h264_level; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE: return h264_loop_filter; case V4L2_CID_MPEG_VIDEO_H264_PROFILE: return h264_profile; case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC: return vui_sar_idc; case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE: return h264_fp_arrangement_type; case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: return h264_fmo_map_type; case V4L2_CID_STATELESS_H264_DECODE_MODE: return h264_decode_mode; case V4L2_CID_STATELESS_H264_START_CODE: return h264_start_code; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_TYPE: return h264_hierarchical_coding_type; case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: return mpeg_mpeg2_level; case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: return mpeg2_profile; case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: return mpeg_mpeg4_level; case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: return mpeg4_profile; case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: return vpx_golden_frame_sel; case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: return vp8_profile; case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: return vp9_profile; case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: return vp9_level; case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: return jpeg_chroma_subsampling; case V4L2_CID_DV_TX_MODE: return dv_tx_mode; case V4L2_CID_DV_TX_RGB_RANGE: case V4L2_CID_DV_RX_RGB_RANGE: return dv_rgb_range; case V4L2_CID_DV_TX_IT_CONTENT_TYPE: case V4L2_CID_DV_RX_IT_CONTENT_TYPE: return dv_it_content_type; case V4L2_CID_DETECT_MD_MODE: return detect_md_mode; case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE: return hevc_profile; case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL: return hevc_level; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_TYPE: return hevc_hierarchial_coding_type; case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_TYPE: return hevc_refresh_type; case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD: return hevc_size_of_length_field; case V4L2_CID_MPEG_VIDEO_HEVC_TIER: return hevc_tier; case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: return hevc_loop_filter_mode; case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: return av1_profile; case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: return av1_level; case V4L2_CID_STATELESS_HEVC_DECODE_MODE: return hevc_decode_mode; case V4L2_CID_STATELESS_HEVC_START_CODE: return hevc_start_code; case V4L2_CID_CAMERA_ORIENTATION: return camera_orientation; case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: return intra_refresh_period_type; default: return NULL; } } EXPORT_SYMBOL(v4l2_ctrl_get_menu); #define __v4l2_qmenu_int_len(arr, len) ({ *(len) = ARRAY_SIZE(arr); (arr); }) /* * Returns NULL or an s64 type array containing the menu for given * control ID. The total number of the menu items is returned in @len. */ const s64 *v4l2_ctrl_get_int_menu(u32 id, u32 *len) { static const s64 qmenu_int_vpx_num_partitions[] = { 1, 2, 4, 8, }; static const s64 qmenu_int_vpx_num_ref_frames[] = { 1, 2, 3, }; switch (id) { case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: return __v4l2_qmenu_int_len(qmenu_int_vpx_num_partitions, len); case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: return __v4l2_qmenu_int_len(qmenu_int_vpx_num_ref_frames, len); default: *len = 0; return NULL; } } EXPORT_SYMBOL(v4l2_ctrl_get_int_menu); /* Return the control name. */ const char *v4l2_ctrl_get_name(u32 id) { switch (id) { /* USER controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_USER_CLASS: return "User Controls"; case V4L2_CID_BRIGHTNESS: return "Brightness"; case V4L2_CID_CONTRAST: return "Contrast"; case V4L2_CID_SATURATION: return "Saturation"; case V4L2_CID_HUE: return "Hue"; case V4L2_CID_AUDIO_VOLUME: return "Volume"; case V4L2_CID_AUDIO_BALANCE: return "Balance"; case V4L2_CID_AUDIO_BASS: return "Bass"; case V4L2_CID_AUDIO_TREBLE: return "Treble"; case V4L2_CID_AUDIO_MUTE: return "Mute"; case V4L2_CID_AUDIO_LOUDNESS: return "Loudness"; case V4L2_CID_BLACK_LEVEL: return "Black Level"; case V4L2_CID_AUTO_WHITE_BALANCE: return "White Balance, Automatic"; case V4L2_CID_DO_WHITE_BALANCE: return "Do White Balance"; case V4L2_CID_RED_BALANCE: return "Red Balance"; case V4L2_CID_BLUE_BALANCE: return "Blue Balance"; case V4L2_CID_GAMMA: return "Gamma"; case V4L2_CID_EXPOSURE: return "Exposure"; case V4L2_CID_AUTOGAIN: return "Gain, Automatic"; case V4L2_CID_GAIN: return "Gain"; case V4L2_CID_HFLIP: return "Horizontal Flip"; case V4L2_CID_VFLIP: return "Vertical Flip"; case V4L2_CID_POWER_LINE_FREQUENCY: return "Power Line Frequency"; case V4L2_CID_HUE_AUTO: return "Hue, Automatic"; case V4L2_CID_WHITE_BALANCE_TEMPERATURE: return "White Balance Temperature"; case V4L2_CID_SHARPNESS: return "Sharpness"; case V4L2_CID_BACKLIGHT_COMPENSATION: return "Backlight Compensation"; case V4L2_CID_CHROMA_AGC: return "Chroma AGC"; case V4L2_CID_COLOR_KILLER: return "Color Killer"; case V4L2_CID_COLORFX: return "Color Effects"; case V4L2_CID_AUTOBRIGHTNESS: return "Brightness, Automatic"; case V4L2_CID_BAND_STOP_FILTER: return "Band-Stop Filter"; case V4L2_CID_ROTATE: return "Rotate"; case V4L2_CID_BG_COLOR: return "Background Color"; case V4L2_CID_CHROMA_GAIN: return "Chroma Gain"; case V4L2_CID_ILLUMINATORS_1: return "Illuminator 1"; case V4L2_CID_ILLUMINATORS_2: return "Illuminator 2"; case V4L2_CID_MIN_BUFFERS_FOR_CAPTURE: return "Min Number of Capture Buffers"; case V4L2_CID_MIN_BUFFERS_FOR_OUTPUT: return "Min Number of Output Buffers"; case V4L2_CID_ALPHA_COMPONENT: return "Alpha Component"; case V4L2_CID_COLORFX_CBCR: return "Color Effects, CbCr"; case V4L2_CID_COLORFX_RGB: return "Color Effects, RGB"; /* * Codec controls * * The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical. * * Keep the order of the 'case's the same as in videodev2.h! */ case V4L2_CID_CODEC_CLASS: return "Codec Controls"; case V4L2_CID_MPEG_STREAM_TYPE: return "Stream Type"; case V4L2_CID_MPEG_STREAM_PID_PMT: return "Stream PMT Program ID"; case V4L2_CID_MPEG_STREAM_PID_AUDIO: return "Stream Audio Program ID"; case V4L2_CID_MPEG_STREAM_PID_VIDEO: return "Stream Video Program ID"; case V4L2_CID_MPEG_STREAM_PID_PCR: return "Stream PCR Program ID"; case V4L2_CID_MPEG_STREAM_PES_ID_AUDIO: return "Stream PES Audio ID"; case V4L2_CID_MPEG_STREAM_PES_ID_VIDEO: return "Stream PES Video ID"; case V4L2_CID_MPEG_STREAM_VBI_FMT: return "Stream VBI Format"; case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: return "Audio Sampling Frequency"; case V4L2_CID_MPEG_AUDIO_ENCODING: return "Audio Encoding"; case V4L2_CID_MPEG_AUDIO_L1_BITRATE: return "Audio Layer I Bitrate"; case V4L2_CID_MPEG_AUDIO_L2_BITRATE: return "Audio Layer II Bitrate"; case V4L2_CID_MPEG_AUDIO_L3_BITRATE: return "Audio Layer III Bitrate"; case V4L2_CID_MPEG_AUDIO_MODE: return "Audio Stereo Mode"; case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: return "Audio Stereo Mode Extension"; case V4L2_CID_MPEG_AUDIO_EMPHASIS: return "Audio Emphasis"; case V4L2_CID_MPEG_AUDIO_CRC: return "Audio CRC"; case V4L2_CID_MPEG_AUDIO_MUTE: return "Audio Mute"; case V4L2_CID_MPEG_AUDIO_AAC_BITRATE: return "Audio AAC Bitrate"; case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: return "Audio AC-3 Bitrate"; case V4L2_CID_MPEG_AUDIO_DEC_PLAYBACK: return "Audio Playback"; case V4L2_CID_MPEG_AUDIO_DEC_MULTILINGUAL_PLAYBACK: return "Audio Multilingual Playback"; case V4L2_CID_MPEG_VIDEO_ENCODING: return "Video Encoding"; case V4L2_CID_MPEG_VIDEO_ASPECT: return "Video Aspect"; case V4L2_CID_MPEG_VIDEO_B_FRAMES: return "Video B Frames"; case V4L2_CID_MPEG_VIDEO_GOP_SIZE: return "Video GOP Size"; case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: return "Video GOP Closure"; case V4L2_CID_MPEG_VIDEO_PULLDOWN: return "Video Pulldown"; case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: return "Video Bitrate Mode"; case V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY: return "Constant Quality"; case V4L2_CID_MPEG_VIDEO_BITRATE: return "Video Bitrate"; case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: return "Video Peak Bitrate"; case V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION: return "Video Temporal Decimation"; case V4L2_CID_MPEG_VIDEO_MUTE: return "Video Mute"; case V4L2_CID_MPEG_VIDEO_MUTE_YUV: return "Video Mute YUV"; case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE: return "Decoder Slice Interface"; case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER: return "MPEG4 Loop Filter Enable"; case V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB: return "Number of Intra Refresh MBs"; case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: return "Intra Refresh Period Type"; case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD: return "Intra Refresh Period"; case V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE: return "Frame Level Rate Control Enable"; case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE: return "H264 MB Level Rate Control"; case V4L2_CID_MPEG_VIDEO_HEADER_MODE: return "Sequence Header Mode"; case V4L2_CID_MPEG_VIDEO_MAX_REF_PIC: return "Max Number of Reference Pics"; case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: return "Frame Skip Mode"; case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY: return "Display Delay"; case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE: return "Display Delay Enable"; case V4L2_CID_MPEG_VIDEO_AU_DELIMITER: return "Generate Access Unit Delimiters"; case V4L2_CID_MPEG_VIDEO_H263_I_FRAME_QP: return "H263 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_P_FRAME_QP: return "H263 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_B_FRAME_QP: return "H263 B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_MIN_QP: return "H263 Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H263_MAX_QP: return "H263 Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_I_FRAME_QP: return "H264 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_QP: return "H264 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H264_B_FRAME_QP: return "H264 B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H264_MAX_QP: return "H264 Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_MIN_QP: return "H264 Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM: return "H264 8x8 Transform Enable"; case V4L2_CID_MPEG_VIDEO_H264_CPB_SIZE: return "H264 CPB Buffer Size"; case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: return "H264 Entropy Mode"; case V4L2_CID_MPEG_VIDEO_H264_I_PERIOD: return "H264 I-Frame Period"; case V4L2_CID_MPEG_VIDEO_H264_LEVEL: return "H264 Level"; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_ALPHA: return "H264 Loop Filter Alpha Offset"; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_BETA: return "H264 Loop Filter Beta Offset"; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE: return "H264 Loop Filter Mode"; case V4L2_CID_MPEG_VIDEO_H264_PROFILE: return "H264 Profile"; case V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_HEIGHT: return "Vertical Size of SAR"; case V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_WIDTH: return "Horizontal Size of SAR"; case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_ENABLE: return "Aspect Ratio VUI Enable"; case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC: return "VUI Aspect Ratio IDC"; case V4L2_CID_MPEG_VIDEO_H264_SEI_FRAME_PACKING: return "H264 Enable Frame Packing SEI"; case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_CURRENT_FRAME_0: return "H264 Set Curr. Frame as Frame0"; case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE: return "H264 FP Arrangement Type"; case V4L2_CID_MPEG_VIDEO_H264_FMO: return "H264 Flexible MB Ordering"; case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: return "H264 Map Type for FMO"; case V4L2_CID_MPEG_VIDEO_H264_FMO_SLICE_GROUP: return "H264 FMO Number of Slice Groups"; case V4L2_CID_MPEG_VIDEO_H264_FMO_CHANGE_DIRECTION: return "H264 FMO Direction of Change"; case V4L2_CID_MPEG_VIDEO_H264_FMO_CHANGE_RATE: return "H264 FMO Size of 1st Slice Grp"; case V4L2_CID_MPEG_VIDEO_H264_FMO_RUN_LENGTH: return "H264 FMO No. of Consecutive MBs"; case V4L2_CID_MPEG_VIDEO_H264_ASO: return "H264 Arbitrary Slice Ordering"; case V4L2_CID_MPEG_VIDEO_H264_ASO_SLICE_ORDER: return "H264 ASO Slice Order"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING: return "Enable H264 Hierarchical Coding"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_TYPE: return "H264 Hierarchical Coding Type"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER:return "H264 Number of HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP: return "H264 Set QP Value for HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION: return "H264 Constrained Intra Pred"; case V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET: return "H264 Chroma QP Index Offset"; case V4L2_CID_MPEG_VIDEO_H264_I_FRAME_MIN_QP: return "H264 I-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_I_FRAME_MAX_QP: return "H264 I-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_MIN_QP: return "H264 P-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_MAX_QP: return "H264 P-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_B_FRAME_MIN_QP: return "H264 B-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_B_FRAME_MAX_QP: return "H264 B-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L0_BR: return "H264 Hierarchical Lay 0 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L1_BR: return "H264 Hierarchical Lay 1 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L2_BR: return "H264 Hierarchical Lay 2 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L3_BR: return "H264 Hierarchical Lay 3 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L4_BR: return "H264 Hierarchical Lay 4 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L5_BR: return "H264 Hierarchical Lay 5 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L6_BR: return "H264 Hierarchical Lay 6 Bitrate"; case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: return "MPEG2 Level"; case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: return "MPEG2 Profile"; case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP: return "MPEG4 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP: return "MPEG4 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP: return "MPEG4 B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_MIN_QP: return "MPEG4 Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_MAX_QP: return "MPEG4 Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: return "MPEG4 Level"; case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: return "MPEG4 Profile"; case V4L2_CID_MPEG_VIDEO_MPEG4_QPEL: return "Quarter Pixel Search Enable"; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MAX_BYTES: return "Maximum Bytes in a Slice"; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MAX_MB: return "Number of MBs in a Slice"; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: return "Slice Partitioning Method"; case V4L2_CID_MPEG_VIDEO_VBV_SIZE: return "VBV Buffer Size"; case V4L2_CID_MPEG_VIDEO_DEC_PTS: return "Video Decoder PTS"; case V4L2_CID_MPEG_VIDEO_DEC_FRAME: return "Video Decoder Frame Count"; case V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR: return "Video Decoder Conceal Color"; case V4L2_CID_MPEG_VIDEO_VBV_DELAY: return "Initial Delay for VBV Control"; case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE: return "Horizontal MV Search Range"; case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE: return "Vertical MV Search Range"; case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER: return "Repeat Sequence Header"; case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME: return "Force Key Frame"; case V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID: return "Base Layer Priority ID"; case V4L2_CID_MPEG_VIDEO_LTR_COUNT: return "LTR Count"; case V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX: return "Frame LTR Index"; case V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES: return "Use LTR Frames"; case V4L2_CID_MPEG_VIDEO_AVERAGE_QP: return "Average QP Value"; case V4L2_CID_FWHT_I_FRAME_QP: return "FWHT I-Frame QP Value"; case V4L2_CID_FWHT_P_FRAME_QP: return "FWHT P-Frame QP Value"; /* VPX controls */ case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: return "VPX Number of Partitions"; case V4L2_CID_MPEG_VIDEO_VPX_IMD_DISABLE_4X4: return "VPX Intra Mode Decision Disable"; case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: return "VPX No. of Refs for P Frame"; case V4L2_CID_MPEG_VIDEO_VPX_FILTER_LEVEL: return "VPX Loop Filter Level Range"; case V4L2_CID_MPEG_VIDEO_VPX_FILTER_SHARPNESS: return "VPX Deblocking Effect Control"; case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_REF_PERIOD: return "VPX Golden Frame Refresh Period"; case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: return "VPX Golden Frame Indicator"; case V4L2_CID_MPEG_VIDEO_VPX_MIN_QP: return "VPX Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_VPX_MAX_QP: return "VPX Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_VPX_I_FRAME_QP: return "VPX I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP: return "VPX P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: return "VP8 Profile"; case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: return "VP9 Profile"; case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: return "VP9 Level"; /* HEVC controls */ case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP: return "HEVC I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_P_FRAME_QP: return "HEVC P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_QP: return "HEVC B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_MIN_QP: return "HEVC Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_MAX_QP: return "HEVC Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_MIN_QP: return "HEVC I-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_MAX_QP: return "HEVC I-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_P_FRAME_MIN_QP: return "HEVC P-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_P_FRAME_MAX_QP: return "HEVC P-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_MIN_QP: return "HEVC B-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_MAX_QP: return "HEVC B-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE: return "HEVC Profile"; case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL: return "HEVC Level"; case V4L2_CID_MPEG_VIDEO_HEVC_TIER: return "HEVC Tier"; case V4L2_CID_MPEG_VIDEO_HEVC_FRAME_RATE_RESOLUTION: return "HEVC Frame Rate Resolution"; case V4L2_CID_MPEG_VIDEO_HEVC_MAX_PARTITION_DEPTH: return "HEVC Maximum Coding Unit Depth"; case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_TYPE: return "HEVC Refresh Type"; case V4L2_CID_MPEG_VIDEO_HEVC_CONST_INTRA_PRED: return "HEVC Constant Intra Prediction"; case V4L2_CID_MPEG_VIDEO_HEVC_LOSSLESS_CU: return "HEVC Lossless Encoding"; case V4L2_CID_MPEG_VIDEO_HEVC_WAVEFRONT: return "HEVC Wavefront"; case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: return "HEVC Loop Filter"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_QP: return "HEVC QP Values"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_TYPE: return "HEVC Hierarchical Coding Type"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_LAYER: return "HEVC Hierarchical Coding Layer"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L0_QP: return "HEVC Hierarchical Layer 0 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L1_QP: return "HEVC Hierarchical Layer 1 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L2_QP: return "HEVC Hierarchical Layer 2 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L3_QP: return "HEVC Hierarchical Layer 3 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L4_QP: return "HEVC Hierarchical Layer 4 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L5_QP: return "HEVC Hierarchical Layer 5 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L6_QP: return "HEVC Hierarchical Layer 6 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L0_BR: return "HEVC Hierarchical Lay 0 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L1_BR: return "HEVC Hierarchical Lay 1 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L2_BR: return "HEVC Hierarchical Lay 2 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L3_BR: return "HEVC Hierarchical Lay 3 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L4_BR: return "HEVC Hierarchical Lay 4 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L5_BR: return "HEVC Hierarchical Lay 5 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L6_BR: return "HEVC Hierarchical Lay 6 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_GENERAL_PB: return "HEVC General PB"; case V4L2_CID_MPEG_VIDEO_HEVC_TEMPORAL_ID: return "HEVC Temporal ID"; case V4L2_CID_MPEG_VIDEO_HEVC_STRONG_SMOOTHING: return "HEVC Strong Intra Smoothing"; case V4L2_CID_MPEG_VIDEO_HEVC_INTRA_PU_SPLIT: return "HEVC Intra PU Split"; case V4L2_CID_MPEG_VIDEO_HEVC_TMV_PREDICTION: return "HEVC TMV Prediction"; case V4L2_CID_MPEG_VIDEO_HEVC_MAX_NUM_MERGE_MV_MINUS1: return "HEVC Max Num of Candidate MVs"; case V4L2_CID_MPEG_VIDEO_HEVC_WITHOUT_STARTCODE: return "HEVC ENC Without Startcode"; case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_PERIOD: return "HEVC Num of I-Frame b/w 2 IDR"; case V4L2_CID_MPEG_VIDEO_HEVC_LF_BETA_OFFSET_DIV2: return "HEVC Loop Filter Beta Offset"; case V4L2_CID_MPEG_VIDEO_HEVC_LF_TC_OFFSET_DIV2: return "HEVC Loop Filter TC Offset"; case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD: return "HEVC Size of Length Field"; case V4L2_CID_MPEG_VIDEO_REF_NUMBER_FOR_PFRAMES: return "Reference Frames for a P-Frame"; case V4L2_CID_MPEG_VIDEO_PREPEND_SPSPPS_TO_IDR: return "Prepend SPS and PPS to IDR"; /* AV1 controls */ case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: return "AV1 Profile"; case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: return "AV1 Level"; /* CAMERA controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_CAMERA_CLASS: return "Camera Controls"; case V4L2_CID_EXPOSURE_AUTO: return "Auto Exposure"; case V4L2_CID_EXPOSURE_ABSOLUTE: return "Exposure Time, Absolute"; case V4L2_CID_EXPOSURE_AUTO_PRIORITY: return "Exposure, Dynamic Framerate"; case V4L2_CID_PAN_RELATIVE: return "Pan, Relative"; case V4L2_CID_TILT_RELATIVE: return "Tilt, Relative"; case V4L2_CID_PAN_RESET: return "Pan, Reset"; case V4L2_CID_TILT_RESET: return "Tilt, Reset"; case V4L2_CID_PAN_ABSOLUTE: return "Pan, Absolute"; case V4L2_CID_TILT_ABSOLUTE: return "Tilt, Absolute"; case V4L2_CID_FOCUS_ABSOLUTE: return "Focus, Absolute"; case V4L2_CID_FOCUS_RELATIVE: return "Focus, Relative"; case V4L2_CID_FOCUS_AUTO: return "Focus, Automatic Continuous"; case V4L2_CID_ZOOM_ABSOLUTE: return "Zoom, Absolute"; case V4L2_CID_ZOOM_RELATIVE: return "Zoom, Relative"; case V4L2_CID_ZOOM_CONTINUOUS: return "Zoom, Continuous"; case V4L2_CID_PRIVACY: return "Privacy"; case V4L2_CID_IRIS_ABSOLUTE: return "Iris, Absolute"; case V4L2_CID_IRIS_RELATIVE: return "Iris, Relative"; case V4L2_CID_AUTO_EXPOSURE_BIAS: return "Auto Exposure, Bias"; case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: return "White Balance, Auto & Preset"; case V4L2_CID_WIDE_DYNAMIC_RANGE: return "Wide Dynamic Range"; case V4L2_CID_IMAGE_STABILIZATION: return "Image Stabilization"; case V4L2_CID_ISO_SENSITIVITY: return "ISO Sensitivity"; case V4L2_CID_ISO_SENSITIVITY_AUTO: return "ISO Sensitivity, Auto"; case V4L2_CID_EXPOSURE_METERING: return "Exposure, Metering Mode"; case V4L2_CID_SCENE_MODE: return "Scene Mode"; case V4L2_CID_3A_LOCK: return "3A Lock"; case V4L2_CID_AUTO_FOCUS_START: return "Auto Focus, Start"; case V4L2_CID_AUTO_FOCUS_STOP: return "Auto Focus, Stop"; case V4L2_CID_AUTO_FOCUS_STATUS: return "Auto Focus, Status"; case V4L2_CID_AUTO_FOCUS_RANGE: return "Auto Focus, Range"; case V4L2_CID_PAN_SPEED: return "Pan, Speed"; case V4L2_CID_TILT_SPEED: return "Tilt, Speed"; case V4L2_CID_UNIT_CELL_SIZE: return "Unit Cell Size"; case V4L2_CID_CAMERA_ORIENTATION: return "Camera Orientation"; case V4L2_CID_CAMERA_SENSOR_ROTATION: return "Camera Sensor Rotation"; case V4L2_CID_HDR_SENSOR_MODE: return "HDR Sensor Mode"; /* FM Radio Modulator controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_FM_TX_CLASS: return "FM Radio Modulator Controls"; case V4L2_CID_RDS_TX_DEVIATION: return "RDS Signal Deviation"; case V4L2_CID_RDS_TX_PI: return "RDS Program ID"; case V4L2_CID_RDS_TX_PTY: return "RDS Program Type"; case V4L2_CID_RDS_TX_PS_NAME: return "RDS PS Name"; case V4L2_CID_RDS_TX_RADIO_TEXT: return "RDS Radio Text"; case V4L2_CID_RDS_TX_MONO_STEREO: return "RDS Stereo"; case V4L2_CID_RDS_TX_ARTIFICIAL_HEAD: return "RDS Artificial Head"; case V4L2_CID_RDS_TX_COMPRESSED: return "RDS Compressed"; case V4L2_CID_RDS_TX_DYNAMIC_PTY: return "RDS Dynamic PTY"; case V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT: return "RDS Traffic Announcement"; case V4L2_CID_RDS_TX_TRAFFIC_PROGRAM: return "RDS Traffic Program"; case V4L2_CID_RDS_TX_MUSIC_SPEECH: return "RDS Music"; case V4L2_CID_RDS_TX_ALT_FREQS_ENABLE: return "RDS Enable Alt Frequencies"; case V4L2_CID_RDS_TX_ALT_FREQS: return "RDS Alternate Frequencies"; case V4L2_CID_AUDIO_LIMITER_ENABLED: return "Audio Limiter Feature Enabled"; case V4L2_CID_AUDIO_LIMITER_RELEASE_TIME: return "Audio Limiter Release Time"; case V4L2_CID_AUDIO_LIMITER_DEVIATION: return "Audio Limiter Deviation"; case V4L2_CID_AUDIO_COMPRESSION_ENABLED: return "Audio Compression Enabled"; case V4L2_CID_AUDIO_COMPRESSION_GAIN: return "Audio Compression Gain"; case V4L2_CID_AUDIO_COMPRESSION_THRESHOLD: return "Audio Compression Threshold"; case V4L2_CID_AUDIO_COMPRESSION_ATTACK_TIME: return "Audio Compression Attack Time"; case V4L2_CID_AUDIO_COMPRESSION_RELEASE_TIME: return "Audio Compression Release Time"; case V4L2_CID_PILOT_TONE_ENABLED: return "Pilot Tone Feature Enabled"; case V4L2_CID_PILOT_TONE_DEVIATION: return "Pilot Tone Deviation"; case V4L2_CID_PILOT_TONE_FREQUENCY: return "Pilot Tone Frequency"; case V4L2_CID_TUNE_PREEMPHASIS: return "Pre-Emphasis"; case V4L2_CID_TUNE_POWER_LEVEL: return "Tune Power Level"; case V4L2_CID_TUNE_ANTENNA_CAPACITOR: return "Tune Antenna Capacitor"; /* Flash controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_FLASH_CLASS: return "Flash Controls"; case V4L2_CID_FLASH_LED_MODE: return "LED Mode"; case V4L2_CID_FLASH_STROBE_SOURCE: return "Strobe Source"; case V4L2_CID_FLASH_STROBE: return "Strobe"; case V4L2_CID_FLASH_STROBE_STOP: return "Stop Strobe"; case V4L2_CID_FLASH_STROBE_STATUS: return "Strobe Status"; case V4L2_CID_FLASH_TIMEOUT: return "Strobe Timeout"; case V4L2_CID_FLASH_INTENSITY: return "Intensity, Flash Mode"; case V4L2_CID_FLASH_TORCH_INTENSITY: return "Intensity, Torch Mode"; case V4L2_CID_FLASH_INDICATOR_INTENSITY: return "Intensity, Indicator"; case V4L2_CID_FLASH_FAULT: return "Faults"; case V4L2_CID_FLASH_CHARGE: return "Charge"; case V4L2_CID_FLASH_READY: return "Ready to Strobe"; /* JPEG encoder controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_JPEG_CLASS: return "JPEG Compression Controls"; case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: return "Chroma Subsampling"; case V4L2_CID_JPEG_RESTART_INTERVAL: return "Restart Interval"; case V4L2_CID_JPEG_COMPRESSION_QUALITY: return "Compression Quality"; case V4L2_CID_JPEG_ACTIVE_MARKER: return "Active Markers"; /* Image source controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_IMAGE_SOURCE_CLASS: return "Image Source Controls"; case V4L2_CID_VBLANK: return "Vertical Blanking"; case V4L2_CID_HBLANK: return "Horizontal Blanking"; case V4L2_CID_ANALOGUE_GAIN: return "Analogue Gain"; case V4L2_CID_TEST_PATTERN_RED: return "Red Pixel Value"; case V4L2_CID_TEST_PATTERN_GREENR: return "Green (Red) Pixel Value"; case V4L2_CID_TEST_PATTERN_BLUE: return "Blue Pixel Value"; case V4L2_CID_TEST_PATTERN_GREENB: return "Green (Blue) Pixel Value"; case V4L2_CID_NOTIFY_GAINS: return "Notify Gains"; /* Image processing controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_IMAGE_PROC_CLASS: return "Image Processing Controls"; case V4L2_CID_LINK_FREQ: return "Link Frequency"; case V4L2_CID_PIXEL_RATE: return "Pixel Rate"; case V4L2_CID_TEST_PATTERN: return "Test Pattern"; case V4L2_CID_DEINTERLACING_MODE: return "Deinterlacing Mode"; case V4L2_CID_DIGITAL_GAIN: return "Digital Gain"; /* DV controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_DV_CLASS: return "Digital Video Controls"; case V4L2_CID_DV_TX_HOTPLUG: return "Hotplug Present"; case V4L2_CID_DV_TX_RXSENSE: return "RxSense Present"; case V4L2_CID_DV_TX_EDID_PRESENT: return "EDID Present"; case V4L2_CID_DV_TX_MODE: return "Transmit Mode"; case V4L2_CID_DV_TX_RGB_RANGE: return "Tx RGB Quantization Range"; case V4L2_CID_DV_TX_IT_CONTENT_TYPE: return "Tx IT Content Type"; case V4L2_CID_DV_RX_POWER_PRESENT: return "Power Present"; case V4L2_CID_DV_RX_RGB_RANGE: return "Rx RGB Quantization Range"; case V4L2_CID_DV_RX_IT_CONTENT_TYPE: return "Rx IT Content Type"; case V4L2_CID_FM_RX_CLASS: return "FM Radio Receiver Controls"; case V4L2_CID_TUNE_DEEMPHASIS: return "De-Emphasis"; case V4L2_CID_RDS_RECEPTION: return "RDS Reception"; case V4L2_CID_RF_TUNER_CLASS: return "RF Tuner Controls"; case V4L2_CID_RF_TUNER_RF_GAIN: return "RF Gain"; case V4L2_CID_RF_TUNER_LNA_GAIN_AUTO: return "LNA Gain, Auto"; case V4L2_CID_RF_TUNER_LNA_GAIN: return "LNA Gain"; case V4L2_CID_RF_TUNER_MIXER_GAIN_AUTO: return "Mixer Gain, Auto"; case V4L2_CID_RF_TUNER_MIXER_GAIN: return "Mixer Gain"; case V4L2_CID_RF_TUNER_IF_GAIN_AUTO: return "IF Gain, Auto"; case V4L2_CID_RF_TUNER_IF_GAIN: return "IF Gain"; case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: return "Bandwidth, Auto"; case V4L2_CID_RF_TUNER_BANDWIDTH: return "Bandwidth"; case V4L2_CID_RF_TUNER_PLL_LOCK: return "PLL Lock"; case V4L2_CID_RDS_RX_PTY: return "RDS Program Type"; case V4L2_CID_RDS_RX_PS_NAME: return "RDS PS Name"; case V4L2_CID_RDS_RX_RADIO_TEXT: return "RDS Radio Text"; case V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT: return "RDS Traffic Announcement"; case V4L2_CID_RDS_RX_TRAFFIC_PROGRAM: return "RDS Traffic Program"; case V4L2_CID_RDS_RX_MUSIC_SPEECH: return "RDS Music"; /* Detection controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_DETECT_CLASS: return "Detection Controls"; case V4L2_CID_DETECT_MD_MODE: return "Motion Detection Mode"; case V4L2_CID_DETECT_MD_GLOBAL_THRESHOLD: return "MD Global Threshold"; case V4L2_CID_DETECT_MD_THRESHOLD_GRID: return "MD Threshold Grid"; case V4L2_CID_DETECT_MD_REGION_GRID: return "MD Region Grid"; /* Stateless Codec controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_CODEC_STATELESS_CLASS: return "Stateless Codec Controls"; case V4L2_CID_STATELESS_H264_DECODE_MODE: return "H264 Decode Mode"; case V4L2_CID_STATELESS_H264_START_CODE: return "H264 Start Code"; case V4L2_CID_STATELESS_H264_SPS: return "H264 Sequence Parameter Set"; case V4L2_CID_STATELESS_H264_PPS: return "H264 Picture Parameter Set"; case V4L2_CID_STATELESS_H264_SCALING_MATRIX: return "H264 Scaling Matrix"; case V4L2_CID_STATELESS_H264_PRED_WEIGHTS: return "H264 Prediction Weight Table"; case V4L2_CID_STATELESS_H264_SLICE_PARAMS: return "H264 Slice Parameters"; case V4L2_CID_STATELESS_H264_DECODE_PARAMS: return "H264 Decode Parameters"; case V4L2_CID_STATELESS_FWHT_PARAMS: return "FWHT Stateless Parameters"; case V4L2_CID_STATELESS_VP8_FRAME: return "VP8 Frame Parameters"; case V4L2_CID_STATELESS_MPEG2_SEQUENCE: return "MPEG-2 Sequence Header"; case V4L2_CID_STATELESS_MPEG2_PICTURE: return "MPEG-2 Picture Header"; case V4L2_CID_STATELESS_MPEG2_QUANTISATION: return "MPEG-2 Quantisation Matrices"; case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR: return "VP9 Probabilities Updates"; case V4L2_CID_STATELESS_VP9_FRAME: return "VP9 Frame Decode Parameters"; case V4L2_CID_STATELESS_HEVC_SPS: return "HEVC Sequence Parameter Set"; case V4L2_CID_STATELESS_HEVC_PPS: return "HEVC Picture Parameter Set"; case V4L2_CID_STATELESS_HEVC_SLICE_PARAMS: return "HEVC Slice Parameters"; case V4L2_CID_STATELESS_HEVC_SCALING_MATRIX: return "HEVC Scaling Matrix"; case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS: return "HEVC Decode Parameters"; case V4L2_CID_STATELESS_HEVC_DECODE_MODE: return "HEVC Decode Mode"; case V4L2_CID_STATELESS_HEVC_START_CODE: return "HEVC Start Code"; case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS: return "HEVC Entry Point Offsets"; case V4L2_CID_STATELESS_AV1_SEQUENCE: return "AV1 Sequence Parameters"; case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY: return "AV1 Tile Group Entry"; case V4L2_CID_STATELESS_AV1_FRAME: return "AV1 Frame Parameters"; case V4L2_CID_STATELESS_AV1_FILM_GRAIN: return "AV1 Film Grain"; /* Colorimetry controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_COLORIMETRY_CLASS: return "Colorimetry Controls"; case V4L2_CID_COLORIMETRY_HDR10_CLL_INFO: return "HDR10 Content Light Info"; case V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY: return "HDR10 Mastering Display"; default: return NULL; } } EXPORT_SYMBOL(v4l2_ctrl_get_name); void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, s64 *min, s64 *max, u64 *step, s64 *def, u32 *flags) { *name = v4l2_ctrl_get_name(id); *flags = 0; switch (id) { case V4L2_CID_AUDIO_MUTE: case V4L2_CID_AUDIO_LOUDNESS: case V4L2_CID_AUTO_WHITE_BALANCE: case V4L2_CID_AUTOGAIN: case V4L2_CID_HFLIP: case V4L2_CID_VFLIP: case V4L2_CID_HUE_AUTO: case V4L2_CID_CHROMA_AGC: case V4L2_CID_COLOR_KILLER: case V4L2_CID_AUTOBRIGHTNESS: case V4L2_CID_MPEG_AUDIO_MUTE: case V4L2_CID_MPEG_VIDEO_MUTE: case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: case V4L2_CID_MPEG_VIDEO_PULLDOWN: case V4L2_CID_EXPOSURE_AUTO_PRIORITY: case V4L2_CID_FOCUS_AUTO: case V4L2_CID_PRIVACY: case V4L2_CID_AUDIO_LIMITER_ENABLED: case V4L2_CID_AUDIO_COMPRESSION_ENABLED: case V4L2_CID_PILOT_TONE_ENABLED: case V4L2_CID_ILLUMINATORS_1: case V4L2_CID_ILLUMINATORS_2: case V4L2_CID_FLASH_STROBE_STATUS: case V4L2_CID_FLASH_CHARGE: case V4L2_CID_FLASH_READY: case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER: case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE: case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE: case V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE: case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE: case V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM: case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_ENABLE: case V4L2_CID_MPEG_VIDEO_MPEG4_QPEL: case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER: case V4L2_CID_MPEG_VIDEO_AU_DELIMITER: case V4L2_CID_WIDE_DYNAMIC_RANGE: case V4L2_CID_IMAGE_STABILIZATION: case V4L2_CID_RDS_RECEPTION: case V4L2_CID_RF_TUNER_LNA_GAIN_AUTO: case V4L2_CID_RF_TUNER_MIXER_GAIN_AUTO: case V4L2_CID_RF_TUNER_IF_GAIN_AUTO: case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: case V4L2_CID_RF_TUNER_PLL_LOCK: case V4L2_CID_RDS_TX_MONO_STEREO: case V4L2_CID_RDS_TX_ARTIFICIAL_HEAD: case V4L2_CID_RDS_TX_COMPRESSED: case V4L2_CID_RDS_TX_DYNAMIC_PTY: case V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT: case V4L2_CID_RDS_TX_TRAFFIC_PROGRAM: case V4L2_CID_RDS_TX_MUSIC_SPEECH: case V4L2_CID_RDS_TX_ALT_FREQS_ENABLE: case V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT: case V4L2_CID_RDS_RX_TRAFFIC_PROGRAM: case V4L2_CID_RDS_RX_MUSIC_SPEECH: *type = V4L2_CTRL_TYPE_BOOLEAN; *min = 0; *max = *step = 1; break; case V4L2_CID_ROTATE: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_MODIFY_LAYOUT; break; case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE: case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE: case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY: case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD: *type = V4L2_CTRL_TYPE_INTEGER; break; case V4L2_CID_MPEG_VIDEO_LTR_COUNT: *type = V4L2_CTRL_TYPE_INTEGER; break; case V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; break; case V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES: *type = V4L2_CTRL_TYPE_BITMASK; *flags |= V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; break; case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME: case V4L2_CID_PAN_RESET: case V4L2_CID_TILT_RESET: case V4L2_CID_FLASH_STROBE: case V4L2_CID_FLASH_STROBE_STOP: case V4L2_CID_AUTO_FOCUS_START: case V4L2_CID_AUTO_FOCUS_STOP: case V4L2_CID_DO_WHITE_BALANCE: *type = V4L2_CTRL_TYPE_BUTTON; *flags |= V4L2_CTRL_FLAG_WRITE_ONLY | V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; *min = *max = *step = *def = 0; break; case V4L2_CID_POWER_LINE_FREQUENCY: case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: case V4L2_CID_MPEG_AUDIO_ENCODING: case V4L2_CID_MPEG_AUDIO_L1_BITRATE: case V4L2_CID_MPEG_AUDIO_L2_BITRATE: case V4L2_CID_MPEG_AUDIO_L3_BITRATE: case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: case V4L2_CID_MPEG_AUDIO_MODE: case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: case V4L2_CID_MPEG_AUDIO_EMPHASIS: case V4L2_CID_MPEG_AUDIO_CRC: case V4L2_CID_MPEG_AUDIO_DEC_PLAYBACK: case V4L2_CID_MPEG_AUDIO_DEC_MULTILINGUAL_PLAYBACK: case V4L2_CID_MPEG_VIDEO_ENCODING: case V4L2_CID_MPEG_VIDEO_ASPECT: case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: case V4L2_CID_MPEG_STREAM_TYPE: case V4L2_CID_MPEG_STREAM_VBI_FMT: case V4L2_CID_EXPOSURE_AUTO: case V4L2_CID_AUTO_FOCUS_RANGE: case V4L2_CID_COLORFX: case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: case V4L2_CID_TUNE_PREEMPHASIS: case V4L2_CID_FLASH_LED_MODE: case V4L2_CID_FLASH_STROBE_SOURCE: case V4L2_CID_MPEG_VIDEO_HEADER_MODE: case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: case V4L2_CID_MPEG_VIDEO_H264_LEVEL: case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE: case V4L2_CID_MPEG_VIDEO_H264_PROFILE: case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC: case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE: case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_TYPE: case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: case V4L2_CID_ISO_SENSITIVITY_AUTO: case V4L2_CID_EXPOSURE_METERING: case V4L2_CID_SCENE_MODE: case V4L2_CID_DV_TX_MODE: case V4L2_CID_DV_TX_RGB_RANGE: case V4L2_CID_DV_TX_IT_CONTENT_TYPE: case V4L2_CID_DV_RX_RGB_RANGE: case V4L2_CID_DV_RX_IT_CONTENT_TYPE: case V4L2_CID_TEST_PATTERN: case V4L2_CID_DEINTERLACING_MODE: case V4L2_CID_TUNE_DEEMPHASIS: case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: case V4L2_CID_DETECT_MD_MODE: case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE: case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL: case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_TYPE: case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_TYPE: case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD: case V4L2_CID_MPEG_VIDEO_HEVC_TIER: case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: case V4L2_CID_STATELESS_HEVC_DECODE_MODE: case V4L2_CID_STATELESS_HEVC_START_CODE: case V4L2_CID_STATELESS_H264_DECODE_MODE: case V4L2_CID_STATELESS_H264_START_CODE: case V4L2_CID_CAMERA_ORIENTATION: case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: case V4L2_CID_HDR_SENSOR_MODE: *type = V4L2_CTRL_TYPE_MENU; break; case V4L2_CID_LINK_FREQ: *type = V4L2_CTRL_TYPE_INTEGER_MENU; break; case V4L2_CID_RDS_TX_PS_NAME: case V4L2_CID_RDS_TX_RADIO_TEXT: case V4L2_CID_RDS_RX_PS_NAME: case V4L2_CID_RDS_RX_RADIO_TEXT: *type = V4L2_CTRL_TYPE_STRING; break; case V4L2_CID_ISO_SENSITIVITY: case V4L2_CID_AUTO_EXPOSURE_BIAS: case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: *type = V4L2_CTRL_TYPE_INTEGER_MENU; break; case V4L2_CID_USER_CLASS: case V4L2_CID_CAMERA_CLASS: case V4L2_CID_CODEC_CLASS: case V4L2_CID_FM_TX_CLASS: case V4L2_CID_FLASH_CLASS: case V4L2_CID_JPEG_CLASS: case V4L2_CID_IMAGE_SOURCE_CLASS: case V4L2_CID_IMAGE_PROC_CLASS: case V4L2_CID_DV_CLASS: case V4L2_CID_FM_RX_CLASS: case V4L2_CID_RF_TUNER_CLASS: case V4L2_CID_DETECT_CLASS: case V4L2_CID_CODEC_STATELESS_CLASS: case V4L2_CID_COLORIMETRY_CLASS: *type = V4L2_CTRL_TYPE_CTRL_CLASS; /* You can neither read nor write these */ *flags |= V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY; *min = *max = *step = *def = 0; break; case V4L2_CID_BG_COLOR: case V4L2_CID_COLORFX_RGB: *type = V4L2_CTRL_TYPE_INTEGER; *step = 1; *min = 0; /* Max is calculated as RGB888 that is 2^24 - 1 */ *max = 0xffffff; break; case V4L2_CID_COLORFX_CBCR: *type = V4L2_CTRL_TYPE_INTEGER; *step = 1; *min = 0; *max = 0xffff; break; case V4L2_CID_FLASH_FAULT: case V4L2_CID_JPEG_ACTIVE_MARKER: case V4L2_CID_3A_LOCK: case V4L2_CID_AUTO_FOCUS_STATUS: case V4L2_CID_DV_TX_HOTPLUG: case V4L2_CID_DV_TX_RXSENSE: case V4L2_CID_DV_TX_EDID_PRESENT: case V4L2_CID_DV_RX_POWER_PRESENT: *type = V4L2_CTRL_TYPE_BITMASK; break; case V4L2_CID_MIN_BUFFERS_FOR_CAPTURE: case V4L2_CID_MIN_BUFFERS_FOR_OUTPUT: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_MPEG_VIDEO_DEC_PTS: *type = V4L2_CTRL_TYPE_INTEGER64; *flags |= V4L2_CTRL_FLAG_VOLATILE | V4L2_CTRL_FLAG_READ_ONLY; *min = *def = 0; *max = 0x1ffffffffLL; *step = 1; break; case V4L2_CID_MPEG_VIDEO_DEC_FRAME: *type = V4L2_CTRL_TYPE_INTEGER64; *flags |= V4L2_CTRL_FLAG_VOLATILE | V4L2_CTRL_FLAG_READ_ONLY; *min = *def = 0; *max = 0x7fffffffffffffffLL; *step = 1; break; case V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR: *type = V4L2_CTRL_TYPE_INTEGER64; *min = 0; /* default for 8 bit black, luma is 16, chroma is 128 */ *def = 0x8000800010LL; *max = 0xffffffffffffLL; *step = 1; break; case V4L2_CID_MPEG_VIDEO_AVERAGE_QP: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_PIXEL_RATE: *type = V4L2_CTRL_TYPE_INTEGER64; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_DETECT_MD_REGION_GRID: *type = V4L2_CTRL_TYPE_U8; break; case V4L2_CID_DETECT_MD_THRESHOLD_GRID: *type = V4L2_CTRL_TYPE_U16; break; case V4L2_CID_RDS_TX_ALT_FREQS: *type = V4L2_CTRL_TYPE_U32; break; case V4L2_CID_STATELESS_MPEG2_SEQUENCE: *type = V4L2_CTRL_TYPE_MPEG2_SEQUENCE; break; case V4L2_CID_STATELESS_MPEG2_PICTURE: *type = V4L2_CTRL_TYPE_MPEG2_PICTURE; break; case V4L2_CID_STATELESS_MPEG2_QUANTISATION: *type = V4L2_CTRL_TYPE_MPEG2_QUANTISATION; break; case V4L2_CID_STATELESS_FWHT_PARAMS: *type = V4L2_CTRL_TYPE_FWHT_PARAMS; break; case V4L2_CID_STATELESS_H264_SPS: *type = V4L2_CTRL_TYPE_H264_SPS; break; case V4L2_CID_STATELESS_H264_PPS: *type = V4L2_CTRL_TYPE_H264_PPS; break; case V4L2_CID_STATELESS_H264_SCALING_MATRIX: *type = V4L2_CTRL_TYPE_H264_SCALING_MATRIX; break; case V4L2_CID_STATELESS_H264_SLICE_PARAMS: *type = V4L2_CTRL_TYPE_H264_SLICE_PARAMS; break; case V4L2_CID_STATELESS_H264_DECODE_PARAMS: *type = V4L2_CTRL_TYPE_H264_DECODE_PARAMS; break; case V4L2_CID_STATELESS_H264_PRED_WEIGHTS: *type = V4L2_CTRL_TYPE_H264_PRED_WEIGHTS; break; case V4L2_CID_STATELESS_VP8_FRAME: *type = V4L2_CTRL_TYPE_VP8_FRAME; break; case V4L2_CID_STATELESS_HEVC_SPS: *type = V4L2_CTRL_TYPE_HEVC_SPS; break; case V4L2_CID_STATELESS_HEVC_PPS: *type = V4L2_CTRL_TYPE_HEVC_PPS; break; case V4L2_CID_STATELESS_HEVC_SLICE_PARAMS: *type = V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS; *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; break; case V4L2_CID_STATELESS_HEVC_SCALING_MATRIX: *type = V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX; break; case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS: *type = V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS; break; case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS: *type = V4L2_CTRL_TYPE_U32; *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; break; case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR: *type = V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR; break; case V4L2_CID_STATELESS_VP9_FRAME: *type = V4L2_CTRL_TYPE_VP9_FRAME; break; case V4L2_CID_STATELESS_AV1_SEQUENCE: *type = V4L2_CTRL_TYPE_AV1_SEQUENCE; break; case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY: *type = V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY; *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; break; case V4L2_CID_STATELESS_AV1_FRAME: *type = V4L2_CTRL_TYPE_AV1_FRAME; break; case V4L2_CID_STATELESS_AV1_FILM_GRAIN: *type = V4L2_CTRL_TYPE_AV1_FILM_GRAIN; break; case V4L2_CID_UNIT_CELL_SIZE: *type = V4L2_CTRL_TYPE_AREA; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_COLORIMETRY_HDR10_CLL_INFO: *type = V4L2_CTRL_TYPE_HDR10_CLL_INFO; break; case V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY: *type = V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY; break; default: *type = V4L2_CTRL_TYPE_INTEGER; break; } switch (id) { case V4L2_CID_MPEG_AUDIO_ENCODING: case V4L2_CID_MPEG_AUDIO_MODE: case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: case V4L2_CID_MPEG_VIDEO_B_FRAMES: case V4L2_CID_MPEG_STREAM_TYPE: *flags |= V4L2_CTRL_FLAG_UPDATE; break; case V4L2_CID_AUDIO_VOLUME: case V4L2_CID_AUDIO_BALANCE: case V4L2_CID_AUDIO_BASS: case V4L2_CID_AUDIO_TREBLE: case V4L2_CID_BRIGHTNESS: case V4L2_CID_CONTRAST: case V4L2_CID_SATURATION: case V4L2_CID_HUE: case V4L2_CID_RED_BALANCE: case V4L2_CID_BLUE_BALANCE: case V4L2_CID_GAMMA: case V4L2_CID_SHARPNESS: case V4L2_CID_CHROMA_GAIN: case V4L2_CID_RDS_TX_DEVIATION: case V4L2_CID_AUDIO_LIMITER_RELEASE_TIME: case V4L2_CID_AUDIO_LIMITER_DEVIATION: case V4L2_CID_AUDIO_COMPRESSION_GAIN: case V4L2_CID_AUDIO_COMPRESSION_THRESHOLD: case V4L2_CID_AUDIO_COMPRESSION_ATTACK_TIME: case V4L2_CID_AUDIO_COMPRESSION_RELEASE_TIME: case V4L2_CID_PILOT_TONE_DEVIATION: case V4L2_CID_PILOT_TONE_FREQUENCY: case V4L2_CID_TUNE_POWER_LEVEL: case V4L2_CID_TUNE_ANTENNA_CAPACITOR: case V4L2_CID_RF_TUNER_RF_GAIN: case V4L2_CID_RF_TUNER_LNA_GAIN: case V4L2_CID_RF_TUNER_MIXER_GAIN: case V4L2_CID_RF_TUNER_IF_GAIN: case V4L2_CID_RF_TUNER_BANDWIDTH: case V4L2_CID_DETECT_MD_GLOBAL_THRESHOLD: *flags |= V4L2_CTRL_FLAG_SLIDER; break; case V4L2_CID_PAN_RELATIVE: case V4L2_CID_TILT_RELATIVE: case V4L2_CID_FOCUS_RELATIVE: case V4L2_CID_IRIS_RELATIVE: case V4L2_CID_ZOOM_RELATIVE: *flags |= V4L2_CTRL_FLAG_WRITE_ONLY | V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; break; case V4L2_CID_FLASH_STROBE_STATUS: case V4L2_CID_AUTO_FOCUS_STATUS: case V4L2_CID_FLASH_READY: case V4L2_CID_DV_TX_HOTPLUG: case V4L2_CID_DV_TX_RXSENSE: case V4L2_CID_DV_TX_EDID_PRESENT: case V4L2_CID_DV_RX_POWER_PRESENT: case V4L2_CID_DV_RX_IT_CONTENT_TYPE: case V4L2_CID_RDS_RX_PTY: case V4L2_CID_RDS_RX_PS_NAME: case V4L2_CID_RDS_RX_RADIO_TEXT: case V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT: case V4L2_CID_RDS_RX_TRAFFIC_PROGRAM: case V4L2_CID_RDS_RX_MUSIC_SPEECH: case V4L2_CID_CAMERA_ORIENTATION: case V4L2_CID_CAMERA_SENSOR_ROTATION: *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_RF_TUNER_PLL_LOCK: *flags |= V4L2_CTRL_FLAG_VOLATILE; break; } } EXPORT_SYMBOL(v4l2_ctrl_fill);
130 130 2 2 2 74 74 46 29 46 1 1 46 74 80 126 135 80 126 126 129 130 45 130 130 130 19 18 8 130 91 129 81 79 128 130 129 125 126 130 130 126 126 126 126 8 8 2 8 130 74 74 74 74 73 73 73 73 73 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2003, 2004 * * This file is part of the SCTP kernel implementation * * This file contains the code relating the chunk abstraction. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* This file is mostly in anticipation of future work, but initially * populate with fragment tracking for an outbound message. */ /* Initialize datamsg from memory. */ static void sctp_datamsg_init(struct sctp_datamsg *msg) { refcount_set(&msg->refcnt, 1); msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } /* Allocate and initialize datamsg. */ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) { struct sctp_datamsg *msg; msg = kmalloc(sizeof(struct sctp_datamsg), gfp); if (msg) { sctp_datamsg_init(msg); SCTP_DBG_OBJCNT_INC(datamsg); } return msg; } void sctp_datamsg_free(struct sctp_datamsg *msg) { struct sctp_chunk *chunk; /* This doesn't have to be a _safe vairant because * sctp_chunk_free() only drops the refs. */ list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_chunk_free(chunk); sctp_datamsg_put(msg); } /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); if (!msg->send_failed) { sctp_chunk_put(chunk); continue; } asoc = chunk->asoc; error = msg->send_error ?: asoc->outqueue.error; sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED_EVENT)) { ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); } SCTP_DBG_OBJCNT_DEC(datamsg); kfree(msg); } /* Hold a reference. */ static void sctp_datamsg_hold(struct sctp_datamsg *msg) { refcount_inc(&msg->refcnt); } /* Release a reference. */ void sctp_datamsg_put(struct sctp_datamsg *msg) { if (refcount_dec_and_test(&msg->refcnt)) sctp_datamsg_destroy(msg); } /* Assign a chunk to this datamsg. */ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) { sctp_datamsg_hold(msg); chunk->msg = msg; } /* A data chunk can have a maximum payload of (2^16 - 20). Break * down any such message into smaller chunks. Opportunistically, fragment * the chunks down to the current MTU constraints. We may get refragmented * later if the PMTU changes, but it is _much better_ to fragment immediately * with a reasonable guess than always doing our fragmentation on the * soft-interrupt. */ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); /* Note: Calculate this outside of the loop, so that all fragments * have the same expiration. */ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive && (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet */ max_data = asoc->frag_point; if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } /* If the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); if (sinfo->sinfo_tsn && sinfo->sinfo_ssn != asoc->active_key_id) { shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); if (!shkey) { err = -EINVAL; goto errout; } } else { shkey = asoc->shkey; } } /* Set first_len and then account for possible bundles on first frag */ first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. * To check that, look at out_qlen and retransmit list. * NOTE: we will not reduce to account for SACK, if the message would * not have been fragmented. */ if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ first_len = msg_len; } /* Create chunks for all DATA chunks. */ for (remaining = msg_len; remaining; remaining -= len) { u8 frag = SCTP_DATA_MIDDLE_FRAG; if (remaining == msg_len) { /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; len = first_len; } else { /* Middle frags */ len = max_data; } if (len >= remaining) { /* Last frag, which may also be the first */ len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the * last DATA chunk of a user message when providing * the user message to the SCTP implementation. */ if ((sinfo->sinfo_flags & SCTP_EOF) || (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; } chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; } err = sctp_user_addto_chunk(chunk, len, from); if (err < 0) goto errout_chunk_free; chunk->shkey = shkey; /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); } return msg; errout_chunk_free: sctp_chunk_free(chunk); errout: list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); sctp_chunk_free(chunk); } sctp_datamsg_put(msg); return ERR_PTR(err); } /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { if (!chunk->asoc->peer.prsctp_capable) return 0; if (chunk->msg->abandoned) return 1; if (!chunk->has_tsn && !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) return 0; if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ return 0; } /* This chunk (and consequently entire message) has failed in its sending. */ void sctp_chunk_fail(struct sctp_chunk *chunk, int error) { chunk->msg->send_failed = 1; chunk->msg->send_error = error; }
2218 358 18 108 773 772 771 770 769 770 2769 2771 2771 2768 1459 988 987 47 912 2041 229 151 150 122 122 1960 1958 16 358 358 21 15 15 1009 5739 24 24 24 23 289 256 6456 6456 5272 609 94 407 2712 141 1350 194 103 251 959 458 5912 1759 2312 3321 3257 4 4 104 104 427 395 1176 1396 427 986 9 79 88 10926 11792 401 1278 79 924 6100 6146 413 4 4 4 3381 1056 454 98 3083 811 808 811 255 6550 5877 113 519 3019 480 3019 379 89 138 368 141 450 239 239 9 4974 4992 47 376 47 47 6568 1252 6 27 204 57 1335 29 364 364 263 165 155 39 29 5422 4929 349 48 49 6 46 49 4 548 74 175 393 3946 9 507 489 546 6 542 1382 884 5 10 4503 298 24 95 6815 6807 4676 6833 493 4428 4429 457 7104 7082 4321 4523 4319 4559 4560 422 454 454 454 454 4330 4330 7083 102 2104 498 4737 4737 4737 4349 4357 4443 4357 4356 4351 375 370 14 1113 411 1 3 4 4 40 1079 937 39 38 70 59 60 16 935 2087 930 3729 82 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> #include <linux/cacheinfo.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); #ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) { max_mapnr = limit; } #else static inline void set_max_mapnr(unsigned long limit) { } #endif extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; extern int page_cluster; extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS # define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else # define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif #include <asm/page.h> #include <asm/processor.h> #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * To prevent common memory management code establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) #define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* to align the pointer to the (prev) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); /* Use only if VMA has no other users */ void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #else /* CONFIG_MMU */ #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 #if CONFIG_ARCH_PKEY_BITS > 3 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #else # define VM_PKEY_BIT3 0 #endif #if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_X86_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. * * These VMAs will get a single end guard page. This helps userspace protect * itself from attacks. A single page is enough for current shadow stack archs * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #endif #if defined(CONFIG_ARM64_GCS) /* * arm64's Guarded Control Stack implements similar functionality and * has similar constraints to shadow stacks. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_6 #endif #ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR # define VM_UFFD_MINOR_BIT 38 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. */ #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED_BIT 39 #define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif #ifdef CONFIG_64BIT #define VM_DROPPABLE_BIT 40 #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) #elif defined(CONFIG_PPC32) #define VM_DROPPABLE VM_ARCH_1 #else #define VM_DROPPABLE VM_NONE #endif #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) /* Common data flag combinations */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN #define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. */ static inline bool vma_start_read(struct vm_area_struct *vma) { /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) return false; /* * Overflow might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } return true; } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ up_read(&vma->vm_lock->lock); rcu_read_unlock(); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. */ static inline void vma_start_write(struct vm_area_struct *vma) { unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; down_write(&vma->vm_lock->lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { if (!rwsem_is_locked(&vma->vm_lock->lock)) vma_assert_write_locked(vma); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) { /* When detaching vma should be write-locked */ if (detached) vma_assert_write_locked(vma); vma->detached = detached; } static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); #else /* CONFIG_PER_VMA_LOCK */ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { return NULL; } static inline void vma_assert_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); } static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; /* * WARNING: vma_init does not initialize vma->vm_lock. * Use vm_area_alloc()/vm_area_free() if vma needs locking. */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { ACCESS_PRIVATE(vma, __vm_flags) = flags; } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. */ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start < vma->vm_mm->brk && vma->vm_end > vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. */ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline bool is_shared_maywrite(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio->_flags_1 & 0xff; } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio->_flags_1 & 0xff; } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_entire_mapcount) + 1; } static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_large_mapcount) + 1; } /** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly on folio reference. * * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. * * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped. */ static inline int folio_mapcount(const struct folio *folio) { int mapcount; if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } return folio_large_mapcount(folio); } /** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables. */ static inline bool folio_mapped(const struct folio *folio) { return folio_mapcount(folio) >= 1; } /* * Return true if this page is mapped into pagetables. * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ static inline bool page_mapped(const struct page *page) { return folio_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } static inline struct folio *virt_to_folio(const void *x) { struct page *page = virt_to_page(x); return page_folio(page); } void __folio_put(struct folio *folio); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return compound_order(page); } /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte, vma); return pte; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A page may belong to an inode's memory mapping. In this case, page->mapping * is the pointer to the inode, and page->index is the file offset of the page, * in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. */ #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!folio_is_zone_device(folio)) return false; return __put_devmap_managed_folio_refs(folio, refs); } #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { return false; } #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) /** * folio_get - Increment the reference count on a folio. * @folio: The folio. * * Context: May be called in any context, as long as you know that * you have a refcount on the folio. If you do not already have one, * folio_try_get() may be the right interface for you to use. */ static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } static inline void get_page(struct page *page) { folio_get(page_folio(page)); } static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } /** * folio_put - Decrement the reference count on a folio. * @folio: The folio. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put() unless you can be sure that it wasn't the * last reference. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) __folio_put(folio); } /** * folio_put_refs - Reduce the reference count on a folio. * @folio: The folio. * @refs: The amount to subtract from the folio's reference count. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put_refs() unless you can be sure that these weren't * the last references. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) __folio_put(folio); } void folios_put_refs(struct folio_batch *folios, unsigned int *refs); /* * union release_pages_arg - an array of pages or folios * * release_pages() releases a simple array of multiple pages, and * accepts various different forms of said page array: either * a regular old boring array of pages, an array of folios, or * an array of encoded page pointers. * * The transparent union syntax for this kind of "any of these * argument types" is all kinds of ugly, so look away. */ typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need to * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folios_put(struct folio_batch *folios) { folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); /* * For some devmap managed pages we need to catch refcount transition * from 2 to 1: */ if (put_devmap_managed_folio_refs(folio, 1)) return; folio_put(folio); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. */ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. */ static inline int page_zone_id(struct page *page) { return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif static inline int folio_nid(const struct folio *folio) { return page_to_nid(&folio->page); } #ifdef CONFIG_NUMA_BALANCING /* page access time bits needs to hold at least 4 seconds */ #define PAGE_ACCESS_TIME_MIN_BITS 12 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS #define PAGE_ACCESS_TIME_BUCKETS \ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) #else #define PAGE_ACCESS_TIME_BUCKETS 0 #endif #define PAGE_ACCESS_TIME_MASK \ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int folio_last_cpupid(struct folio *folio) { return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int folio_last_cpupid(struct folio *folio) { return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; last_time = folio_xchg_last_cpupid(folio, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return folio_nid(folio); /* XXX */ } static inline int folio_xchg_access_time(struct folio *folio, int time) { return 0; } static inline int folio_last_cpupid(struct folio *folio) { return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } static inline bool folio_use_access_time(struct folio *folio) { return false; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff. */ static inline u8 page_kasan_tag(const struct page *page) { u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { unsigned long old_flags, flags; if (!kasan_enabled()) return; tag ^= 0xff; old_flags = READ_ONCE(page->flags); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline u8 page_kasan_tag(const struct page *page) { return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } static inline pg_data_t *page_pgdat(const struct page *page) { return NODE_DATA(page_to_nid(page)); } static inline struct zone *folio_zone(const struct folio *folio) { return page_zone(&folio->page); } static inline pg_data_t *folio_pgdat(const struct folio *folio) { return page_pgdat(&folio->page); } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif /** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. * * Return: The Page Frame Number of the first page in the folio. */ static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } static inline struct folio *pfn_folio(unsigned long pfn) { return page_folio(pfn_to_page(pfn)); } /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. * * This function checks if a folio has been pinned via a call to * a function in the pin_user_pages() family. * * For small folios, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably * pinned for DMA, but possibly a false positive due to having at least * GUP_PIN_COUNTING_BIAS worth of normal folio references". * * False positives are OK, because: a) it's unlikely for a folio to * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * Return: True, if it is likely that the folio has been "dma-pinned". * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then * folio_ref_count() returns a negative value, and callers will avoid * further incrementing the refcount. * * Here, for that overflow case, use the sign bit to count a little * bit higher via unsigned math, and thus still get an accurate result. */ return ((unsigned int)folio_ref_count(folio)) >= GUP_PIN_COUNTING_BIAS; } /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; return folio_maybe_dma_pinned(folio); } /** * is_zero_page - Query if a page is a zero page * @page: The page to query * * This returns true if @page is one of the permanent zero pages. */ static inline bool is_zero_page(const struct page *page) { return is_zero_pfn(page_to_pfn(page)); } /** * is_zero_folio - Query if a folio is a zero page * @folio: The folio to query * * This returns true if @folio is one of the permanent zero pages. */ static inline bool is_zero_folio(const struct folio *folio) { return is_zero_page(&folio->page); } /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ #ifdef CONFIG_MIGRATION static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif /* The zero page can be "pinned" but gets special handling. */ if (is_zero_folio(folio)) return true; /* Coherent device memory must always allow eviction. */ if (folio_is_device_coherent(folio)) return false; /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); } #else static inline bool folio_is_longterm_pinnable(struct folio *folio) { return true; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } /** * folio_nr_pages - The number of pages in the folio. * @folio: The folio. * * Return: A positive power of two. */ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else return 1L << (folio->_flags_1 & 0xff); #endif } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) #else #define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES #endif /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ static inline unsigned long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else return 1L << (folio->_flags_1 & 0xff); #endif } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ static inline int thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * * If you have physically contiguous memory which may span more than * one folio (eg a &struct bio_vec), use this function to move from one * folio to the next. Do not use it if the memory is only virtually * contiguous as the folios are almost certainly not adjacent to each * other. This is the folio equivalent to writing ``page++``. * * Context: We assume that the folios are refcounted and/or locked at a * higher level and do not adjust the reference counts. * Return: The next struct folio. */ static inline struct folio *folio_next(struct folio *folio) { return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } /** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ static inline unsigned int folio_shift(const struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } /** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. */ static inline size_t folio_size(const struct folio *folio) { return PAGE_SIZE << folio_order(folio); } /** * folio_likely_mapped_shared - Estimate if the folio is mapped into the page * tables of more than one MM * @folio: The folio. * * This function checks if the folio is currently mapped into more than one * MM ("mapped shared"), or if the folio is only mapped into a single MM * ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * * As precise information is not easily available for all folios, this function * estimates the number of MMs ("sharers") that are currently mapping a folio * using the number of times the first page of the folio is currently mapped * into page tables. * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly * indicate "mapped exclusively" (false negative) when the folio is * only partially mapped into at least one MM. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). * * This function does not consider: * #. If the folio might get mapped in the (near) future (e.g., swapcache, * pagecache, temporary unmapping for migration). * #. If the folio is mapped differently (VM_PFNMAP). * #. If hugetlb page table sharing applies. Callers might want to check * hugetlb_pmd_shared(). * * Return: Whether the folio is estimated to be mapped into more than one MM. */ static inline bool folio_likely_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); /* Only partially-mappable folios require more care. */ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; /* A single mapping implies "mapped exclusively". */ if (mapcount <= 1) return false; /* If any page is mapped more than once we treat it "mapped shared". */ if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) return true; /* Let's guess based on the first subpage. */ return atomic_read(&folio->_mapcount) > 0; } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { return 0; } #endif /* * Some inline functions in vmstat.h depend on page_zone() */ #include <linux/vmstat.h> #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif #if defined(WANT_PAGE_VIRTUAL) static inline void *page_address(const struct page *page) { return page->virtual; } static inline void set_page_address(struct page *page, void *address) { page->virtual = address; } #define page_address_init() do { } while(0) #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(const struct page *page); void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif static inline void *folio_address(const struct folio *folio) { return page_address(&folio->page); } /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool page_is_pfmemalloc(const struct page *page) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)page->lru.next & BIT(1); } /* * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool folio_is_pfmemalloc(const struct folio *folio) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)folio->lru.next & BIT(1); } /* * Only to be called by the page allocator on a freshly allocated * page. */ static inline void set_page_pfmemalloc(struct page *page) { page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { page->lru.next = NULL; } /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely * drop the page in the mm, either by truncation or unmapping of the vma. By * default, the flag is not set. */ #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, althrough it won't provide the memory usage benefits. */ return raw_smp_processor_id(); } #endif #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); static inline void zap_vma_pages(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); struct follow_pfnmap_args { /** * Inputs: * @vma: Pointer to @vm_area_struct struct * @address: the virtual address to walk */ struct vm_area_struct *vma; unsigned long address; /** * Internals: * * The caller shouldn't touch any of these. */ spinlock_t *lock; pte_t *ptep; /** * Outputs: * * @pfn: the PFN of the address * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; pgprot_t pgprot; bool writable; bool special; }; int follow_pfnmap_start(struct follow_pfnmap_args *args); void follow_pfnmap_end(struct follow_pfnmap_args *args); extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_folio(struct address_space *mapping, struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); return -EFAULT; } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { unmap_mapping_range(mapping, holebegin, holelen, 0); } static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); /* * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. */ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, struct vm_area_struct **vmap) { struct page *page; struct vm_area_struct *vma; int got; if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) return ERR_PTR(-EINVAL); got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { put_page(page); return ERR_PTR(-EINVAL); } *vmap = vma; return page; } long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset); int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However * for now all the callers are only use one of the flags at the same * time. */ /* * Whether we should manually check if we can map individual PTEs writable, * because something (e.g., COW, uffd-wp) blocks that from happening for all * PTEs automatically in a writable mapping. */ #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ #define MM_CP_UFFD_WP (1UL << 2) /* do wp */ #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) { return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; } /* * per-process(per-mm_struct) statistics. */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { return percpu_counter_read_positive(&mm->rss_stat[member]); } void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { percpu_counter_add(&mm->rss_stat[member], value); mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { percpu_counter_inc(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { percpu_counter_dec(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } /* Optimized variant when folio is already known not to be anon */ static inline int mm_counter_file(struct folio *folio) { if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_ANONPAGES) + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); } static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); } static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); if ((mm)->hiwater_rss < _rss) (mm)->hiwater_rss = _rss; } static inline void update_hiwater_vm(struct mm_struct *mm) { if (mm->hiwater_vm < mm->total_vm) mm->hiwater_vm = mm->total_vm; } static inline void reset_mm_hiwater_rss(struct mm_struct *mm) { mm->hiwater_rss = get_mm_rss(mm); } static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { unsigned long hiwater_rss = get_mm_hiwater_rss(mm); if (*maxrss < hiwater_rss) *maxrss = hiwater_rss; } #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #endif #ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP static inline bool pmd_special(pmd_t pmd) { return false; } static inline pmd_t pmd_mkspecial(pmd_t pmd) { return pmd; } #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ #ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP static inline bool pud_special(pud_t pud) { return false; } static inline pud_t pud_mkspecial(pud_t pud) { return pud; } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; } #endif extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pte_t *ptep; __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); return ptep; } #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return 0; } #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { atomic_long_set(&mm->pgtables_bytes, 0); } static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? NULL : p4d_offset(pgd, address); } static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } #endif /* CONFIG_MMU */ static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); } static inline void *ptdesc_to_virt(const struct ptdesc *pt) { return page_to_virt(ptdesc_page(pt)); } static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); } static inline bool pagetable_is_reserved(struct ptdesc *pt) { return folio_test_reserved(ptdesc_folio(pt)); } /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags * @order: desired pagetable order * * pagetable_alloc allocates memory for page tables as well as a page table * descriptor to describe that memory. * * Return: The ptdesc describing the allocated page tables. */ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); return page_ptdesc(page); } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) /** * pagetable_free - Free pagetables * @pt: The page table descriptor * * pagetable_free frees the memory of all page tables described by a page * table descriptor and the memory for the descriptor itself. */ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); __free_pages(page, compound_order(page)); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) { } static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); return ptlock_ptr(virt_to_ptdesc(pte)); } static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); if (!ptlock_alloc(ptdesc)) return false; spin_lock_init(ptlock_ptr(ptdesc)); return true; } #else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); __folio_clear_pgtable(folio); lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) { pagetable_dtor(ptdesc); pagetable_free(ptdesc); } static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { if (!ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; } pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { pte_t *pte; __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); return pte; } static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); } pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pte_t *pte; __cond_lock(RCU, __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ pte_unmap(pte); \ } while (0) #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) #if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); } static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) { return page_ptdesc(pmd_pgtable_page(pmd)); } static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif return ptlock_init(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) #endif static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); return ptl; } static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { if (!pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); return true; } /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be * considered ready to switch to split PUD locks yet; there may be places * which need to be converted from page_table_lock. */ static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) { return &mm->page_table_lock; } static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) { spinlock_t *ptl = pud_lockptr(mm, pud); spin_lock(ptl); return ptl; } static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); #define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { SetPageReserved(page); adjust_managed_page_count(page, -1); } static inline void free_reserved_ptdesc(struct ptdesc *pt) { free_reserved_page(ptdesc_page(pt)); } /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within * range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) { int nid; unsigned long phys_pages = 0; for_each_online_node(nid) phys_pages += node_present_pages(nid); return phys_pages; } /* * Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void mem_init(void); extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); static inline void show_mem(void) { __show_mem(0, NULL, MAX_NR_ZONES - 1); } extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); #define vma_interval_tree_foreach(vma, root, start, last) \ for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, struct rb_root_cached *root); struct anon_vma_chain * anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB void anon_vma_interval_tree_verify(struct anon_vma_chain *node); #endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, unsigned long end_data, unsigned long start_data) { if (rlim < RLIM_INFINITY) { if (((new - start) + (end_data - start_data)) > rlim) return -ENOSPC; } return 0; } extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) { /* Ignore errors */ (void) __mm_populate(addr, len, 1); } #else static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 unsigned long flags; unsigned long length; unsigned long low_limit; unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; unsigned long start_gap; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ extern void truncate_inode_pages(struct address_space *, loff_t); extern void truncate_inode_pages_range(struct address_space *, loff_t lstart, loff_t lend); extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); /* * Look up the first VMA which intersects the interval [start_addr, end_addr) * NULL if none. Assume start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address * @mm: The process address space. * @addr: The user address. * * Return: The vm_area_struct at the given address, %NULL otherwise. */ static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { return mtree_load(&mm->mm_mt, addr); } static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; /* See reasoning around the VM_SHADOW_STACK definition */ if (vma->vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; vm_start -= gap; if (vm_start > vma->vm_start) vm_start = 0; return vm_start; } static inline unsigned long vm_end_gap(struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; if (vma->vm_flags & VM_GROWSUP) { vm_end += stack_guard_gap; if (vm_end < vma->vm_end) vm_end = -PAGE_SIZE; } return vm_end; } static inline unsigned long vma_pages(struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; return vma; } static inline bool range_in_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); } #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) { return __pgprot(0); } static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } #endif void vma_set_file(struct vm_area_struct *vma, struct file *file); #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { int err = vm_insert_page(vma, addr, page); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } #ifndef io_remap_pfn_range static inline int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); } #endif static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) return VM_FAULT_OOM; else if (err == -EHWPOISON) return VM_FAULT_HWPOISON; return VM_FAULT_SIGBUS; } /* * Convert errno to return value for ->page_mkwrite() calls. * * This should eventually be merged with vmf_error() above, but will need a * careful audit of all vmf_error() callers. */ static inline vm_fault_t vmf_fs_error(int err) { if (err == 0) return VM_FAULT_LOCKED; if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS; } static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; return 0; } /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, unsigned int flags) { /* * If callers don't want to honor NUMA hinting faults, no need to * determine if we would actually have to trigger a NUMA hinting fault. */ if (!(flags & FOLL_HONOR_NUMA_FAULT)) return true; /* * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. * * Requiring a fault here even for inaccessible VMAs would mean that * FOLL_FORCE cannot make any progress, because handle_mm_fault() * refuses to process NUMA hinting faults in inaccessible VMAs. */ return !vma_is_accessible(vma); } typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); extern bool _page_poisoning_enabled_early; DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); static inline bool page_poisoning_enabled(void) { return _page_poisoning_enabled_early; } /* * For use in fast paths after init_mem_debugging() has run, or when a * false negative result is not harmful when called too early. */ static inline bool page_poisoning_enabled_static(void) { return static_branch_unlikely(&_page_poisoning_enabled); } static inline void kernel_poison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_poison_pages(page, numpages); } static inline void kernel_unpoison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_unpoison_pages(page, numpages); } #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) return true; return flags & __GFP_ZERO; } DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free); } extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled_early; } /* * For use in fast paths after mem_debugging_and_hardening_init() has run, * or when a false negative result is not harmful when called too early. */ static inline bool debug_pagealloc_enabled_static(void) { if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) return false; return static_branch_unlikely(&_debug_pagealloc_enabled); } /* * To support DEBUG_PAGEALLOC architecture must ensure that * __kernel_map_pages() never fails */ extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 1); } static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { return _debug_guardpage_minorder; } static inline bool debug_guardpage_enabled(void) { return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) { if (!debug_guardpage_enabled()) return false; return PageGuard(page); } bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return false; return __set_page_guard(zone, page, order); } void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return; __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); extern int in_gate_area(struct mm_struct *mm, unsigned long addr); #else static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } #endif /* __HAVE_ARCH_GATE_AREA */ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #endif void drop_slab(void); #ifndef CONFIG_MMU #define randomize_va_space 0 #else extern int randomize_va_space; #endif const char * arch_vma_name(struct vm_area_struct *vma); #ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); #else static inline void print_vma_addr(char *prefix, unsigned long rip) { } #endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, struct page *reuse); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next); int vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) return altmap->reserve + altmap->free; return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { altmap->alloc -= nr_pfns; } #else static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } #endif #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long nr_pages; unsigned long nr_vmemmap_pages; if (!pgmap || !is_power_of_2(sizeof(struct page))) return false; nr_pages = pgmap_vmemmap_nr(pgmap); nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); /* * For vmemmap optimization with DAX we need minimum 2 vmemmap * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst */ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } /* * If we don't have an architecture override, use the generic rule */ #ifndef vmemmap_can_optimize #define vmemmap_can_optimize __vmemmap_can_optimize #endif #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { return false; } #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long nr_pages); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE /* * Sysfs entries for memory failure handling statistics. */ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { } static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void num_poisoned_pages_inc(unsigned long pfn) { } static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { } #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); #else static inline void memblk_nr_poison_inc(unsigned long pfn) { } static inline void memblk_nr_poison_sub(unsigned long pfn, long i) { } #endif #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) { return -ENXIO; } #endif #ifndef arch_is_platform_page static inline bool arch_is_platform_page(u64 paddr) { return false; } #endif /* * Error handlers for various types of pages. */ enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ MF_RECOVERED, /* Successfully recovered */ }; enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, MF_MSG_DIRTY_MLOCKED_LRU, MF_MSG_CLEAN_MLOCKED_LRU, MF_MSG_DIRTY_UNEVICTABLE_LRU, MF_MSG_CLEAN_UNEVICTABLE_LRU, MF_MSG_DIRTY_LRU, MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); /** * vma_is_special_huge - Are transhuge page-table entries considered special? * @vma: Pointer to the struct vm_area_struct to consider * * Whether transhuge page-table entries are considered "special" following * the definition in vm_normal_page(). * * Return: true if transhuge page-table entries should be considered special, * false otherwise. */ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) { return vma_is_dax(vma) || (vma->vm_file && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else static inline void setup_nr_node_ids(void) {} #endif extern int memcmp_pages(struct page *page1, struct page *page2); static inline int pages_identical(struct page *page1, struct page *page2) { return !memcmp_pages(page1, page2); } #ifdef CONFIG_MAPPING_DIRTY_HELPERS unsigned long clean_record_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, pgoff_t bitmap_pgoff, unsigned long *bitmap, pgoff_t *start, pgoff_t *end); unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif extern int sysctl_nr_trim_pages; #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif #ifdef CONFIG_UNACCEPTED_MEMORY bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { return false; } static inline void accept_memory(phys_addr_t start, unsigned long size) { } #endif static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t len_in, unsigned long flags); #else static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { /* noop on 32 bit */ return 0; } #endif /* * user_alloc_needs_zeroing checks if a user folio from page allocator needs to * be zeroed or not. */ static inline bool user_alloc_needs_zeroing(void) { /* * for user folios, arch with cache aliasing requires cache flush and * arc changes folio->flags to make icache coherent with dcache, so * always return false to make caller use * clear_user_page()/clear_user_highpage(). */ return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc); } int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #endif /* _LINUX_MM_H */
149 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SMT_H #define _LINUX_SCHED_SMT_H #include <linux/static_key.h> #ifdef CONFIG_SCHED_SMT extern struct static_key_false sched_smt_present; static __always_inline bool sched_smt_active(void) { return static_branch_likely(&sched_smt_present); } #else static inline bool sched_smt_active(void) { return false; } #endif void arch_smt_update(void); #endif /* _LINUX_SCHED_SMT_H */
15 15 15 15 15 15 9 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 // SPDX-License-Identifier: GPL-2.0 #include <linux/log2.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include "darray.h" int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); /* * This is a workaround: kvmalloc() doesn't support > INT_MAX * allocations, but vmalloc() does. * The limit needs to be lifted from kvmalloc, and when it does * we'll go back to just using that. */ size_t bytes; if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) return -ENOMEM; void *data = likely(bytes < INT_MAX) ? kvmalloc_noprof(bytes, gfp) : vmalloc_noprof(bytes); if (!data) return -ENOMEM; if (d->size) memcpy(data, d->data, d->size * element_size); if (d->data != d->preallocated) kvfree(d->data); d->data = data; d->size = new_size; } return 0; }
53 52 29 27 27 27 25 25 25 25 25 25 25 25 2 4 3 3 2 1 3 22 22 3 3 2 1 1 1 3 27 25 3 3 27 22 22 21 21 22 17 22 22 22 31 3 9 10 10 22 17 22 31 31 64 3 64 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 /* * Created: Tue Feb 2 08:37:54 1999 by faith@valinux.com * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Author Rickard E. (Rik) Faith <faith@valinux.com> * Author Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/slab.h> #include <drm/drm_auth.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_lease.h> #include <drm/drm_print.h> #include "drm_internal.h" /** * DOC: master and authentication * * &struct drm_master is used to track groups of clients with open * primary device nodes. For every &struct drm_file which has had at * least once successfully became the device master (either through the * SET_MASTER IOCTL, or implicitly through opening the primary device node when * no one else is the current master that time) there exists one &drm_master. * This is noted in &drm_file.is_master. All other clients have just a pointer * to the &drm_master they are associated with. * * In addition only one &drm_master can be the current master for a &drm_device. * It can be switched through the DROP_MASTER and SET_MASTER IOCTL, or * implicitly through closing/opening the primary device node. See also * drm_is_current_master(). * * Clients can authenticate against the current master (if it matches their own) * using the GETMAGIC and AUTHMAGIC IOCTLs. Together with exchanging masters, * this allows controlled access to the device for an entire group of mutually * trusted clients. */ static bool drm_is_current_master_locked(struct drm_file *fpriv) { lockdep_assert_once(lockdep_is_held(&fpriv->master_lookup_lock) || lockdep_is_held(&fpriv->minor->dev->master_mutex)); return fpriv->is_master && drm_lease_owner(fpriv->master) == fpriv->minor->dev->master; } /** * drm_is_current_master - checks whether @priv is the current master * @fpriv: DRM file private * * Checks whether @fpriv is current master on its device. This decides whether a * client is allowed to run DRM_MASTER IOCTLs. * * Most of the modern IOCTL which require DRM_MASTER are for kernel modesetting * - the current master is assumed to own the non-shareable display hardware. */ bool drm_is_current_master(struct drm_file *fpriv) { bool ret; spin_lock(&fpriv->master_lookup_lock); ret = drm_is_current_master_locked(fpriv); spin_unlock(&fpriv->master_lookup_lock); return ret; } EXPORT_SYMBOL(drm_is_current_master); int drm_getmagic(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_auth *auth = data; int ret = 0; mutex_lock(&dev->master_mutex); if (!file_priv->magic) { ret = idr_alloc(&file_priv->master->magic_map, file_priv, 1, 0, GFP_KERNEL); if (ret >= 0) file_priv->magic = ret; } auth->magic = file_priv->magic; mutex_unlock(&dev->master_mutex); drm_dbg_core(dev, "%u\n", auth->magic); return ret < 0 ? ret : 0; } int drm_authmagic(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_auth *auth = data; struct drm_file *file; drm_dbg_core(dev, "%u\n", auth->magic); mutex_lock(&dev->master_mutex); file = idr_find(&file_priv->master->magic_map, auth->magic); if (file) { file->authenticated = 1; idr_replace(&file_priv->master->magic_map, NULL, auth->magic); } mutex_unlock(&dev->master_mutex); return file ? 0 : -EINVAL; } struct drm_master *drm_master_create(struct drm_device *dev) { struct drm_master *master; master = kzalloc(sizeof(*master), GFP_KERNEL); if (!master) return NULL; kref_init(&master->refcount); idr_init_base(&master->magic_map, 1); master->dev = dev; /* initialize the tree of output resource lessees */ INIT_LIST_HEAD(&master->lessees); INIT_LIST_HEAD(&master->lessee_list); idr_init(&master->leases); idr_init_base(&master->lessee_idr, 1); return master; } static void drm_set_master(struct drm_device *dev, struct drm_file *fpriv, bool new_master) { dev->master = drm_master_get(fpriv->master); if (dev->driver->master_set) dev->driver->master_set(dev, fpriv, new_master); fpriv->was_master = true; } static int drm_new_set_master(struct drm_device *dev, struct drm_file *fpriv) { struct drm_master *old_master; struct drm_master *new_master; lockdep_assert_held_once(&dev->master_mutex); WARN_ON(fpriv->is_master); old_master = fpriv->master; new_master = drm_master_create(dev); if (!new_master) return -ENOMEM; spin_lock(&fpriv->master_lookup_lock); fpriv->master = new_master; spin_unlock(&fpriv->master_lookup_lock); fpriv->is_master = 1; fpriv->authenticated = 1; drm_set_master(dev, fpriv, true); if (old_master) drm_master_put(&old_master); return 0; } /* * In the olden days the SET/DROP_MASTER ioctls used to return EACCES when * CAP_SYS_ADMIN was not set. This was used to prevent rogue applications * from becoming master and/or failing to release it. * * At the same time, the first client (for a given VT) is _always_ master. * Thus in order for the ioctls to succeed, one had to _explicitly_ run the * application as root or flip the setuid bit. * * If the CAP_SYS_ADMIN was missing, no other client could become master... * EVER :-( Leading to a) the graphics session dying badly or b) a completely * locked session. * * * As some point systemd-logind was introduced to orchestrate and delegate * master as applicable. It does so by opening the fd and passing it to users * while in itself logind a) does the set/drop master per users' request and * b) * implicitly drops master on VT switch. * * Even though logind looks like the future, there are a few issues: * - some platforms don't have equivalent (Android, CrOS, some BSDs) so * root is required _solely_ for SET/DROP MASTER. * - applications may not be updated to use it, * - any client which fails to drop master* can DoS the application using * logind, to a varying degree. * * * Either due missing CAP_SYS_ADMIN or simply not calling DROP_MASTER. * * * Here we implement the next best thing: * - ensure the logind style of fd passing works unchanged, and * - allow a client to drop/set master, iff it is/was master at a given point * in time. * * Note: DROP_MASTER cannot be free for all, as an arbitrator user could: * - DoS/crash the arbitrator - details would be implementation specific * - open the node, become master implicitly and cause issues * * As a result this fixes the following when using root-less build w/o logind * - startx * - weston * - various compositors based on wlroots */ static int drm_master_check_perm(struct drm_device *dev, struct drm_file *file_priv) { if (file_priv->was_master && rcu_access_pointer(file_priv->pid) == task_tgid(current)) return 0; if (!capable(CAP_SYS_ADMIN)) return -EACCES; return 0; } int drm_setmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { int ret; mutex_lock(&dev->master_mutex); ret = drm_master_check_perm(dev, file_priv); if (ret) goto out_unlock; if (drm_is_current_master_locked(file_priv)) goto out_unlock; if (dev->master) { ret = -EBUSY; goto out_unlock; } if (!file_priv->master) { ret = -EINVAL; goto out_unlock; } if (!file_priv->is_master) { ret = drm_new_set_master(dev, file_priv); goto out_unlock; } if (file_priv->master->lessor != NULL) { drm_dbg_lease(dev, "Attempt to set lessee %d as master\n", file_priv->master->lessee_id); ret = -EINVAL; goto out_unlock; } drm_set_master(dev, file_priv, false); out_unlock: mutex_unlock(&dev->master_mutex); return ret; } static void drm_drop_master(struct drm_device *dev, struct drm_file *fpriv) { if (dev->driver->master_drop) dev->driver->master_drop(dev, fpriv); drm_master_put(&dev->master); } int drm_dropmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { int ret; mutex_lock(&dev->master_mutex); ret = drm_master_check_perm(dev, file_priv); if (ret) goto out_unlock; if (!drm_is_current_master_locked(file_priv)) { ret = -EINVAL; goto out_unlock; } if (!dev->master) { ret = -EINVAL; goto out_unlock; } if (file_priv->master->lessor != NULL) { drm_dbg_lease(dev, "Attempt to drop lessee %d as master\n", file_priv->master->lessee_id); ret = -EINVAL; goto out_unlock; } drm_drop_master(dev, file_priv); out_unlock: mutex_unlock(&dev->master_mutex); return ret; } int drm_master_open(struct drm_file *file_priv) { struct drm_device *dev = file_priv->minor->dev; int ret = 0; /* if there is no current master make this fd it, but do not create * any master object for render clients */ mutex_lock(&dev->master_mutex); if (!dev->master) { ret = drm_new_set_master(dev, file_priv); } else { spin_lock(&file_priv->master_lookup_lock); file_priv->master = drm_master_get(dev->master); spin_unlock(&file_priv->master_lookup_lock); } mutex_unlock(&dev->master_mutex); return ret; } void drm_master_release(struct drm_file *file_priv) { struct drm_device *dev = file_priv->minor->dev; struct drm_master *master; mutex_lock(&dev->master_mutex); master = file_priv->master; if (file_priv->magic) idr_remove(&file_priv->master->magic_map, file_priv->magic); if (!drm_is_current_master_locked(file_priv)) goto out; if (dev->master == file_priv->master) drm_drop_master(dev, file_priv); out: if (drm_core_check_feature(dev, DRIVER_MODESET) && file_priv->is_master) { /* Revoke any leases held by this or lessees, but only if * this is the "real" master */ drm_lease_revoke(master); } /* drop the master reference held by the file priv */ if (file_priv->master) drm_master_put(&file_priv->master); mutex_unlock(&dev->master_mutex); } /** * drm_master_get - reference a master pointer * @master: &struct drm_master * * Increments the reference count of @master and returns a pointer to @master. */ struct drm_master *drm_master_get(struct drm_master *master) { kref_get(&master->refcount); return master; } EXPORT_SYMBOL(drm_master_get); /** * drm_file_get_master - reference &drm_file.master of @file_priv * @file_priv: DRM file private * * Increments the reference count of @file_priv's &drm_file.master and returns * the &drm_file.master. If @file_priv has no &drm_file.master, returns NULL. * * Master pointers returned from this function should be unreferenced using * drm_master_put(). */ struct drm_master *drm_file_get_master(struct drm_file *file_priv) { struct drm_master *master = NULL; spin_lock(&file_priv->master_lookup_lock); if (!file_priv->master) goto unlock; master = drm_master_get(file_priv->master); unlock: spin_unlock(&file_priv->master_lookup_lock); return master; } EXPORT_SYMBOL(drm_file_get_master); static void drm_master_destroy(struct kref *kref) { struct drm_master *master = container_of(kref, struct drm_master, refcount); struct drm_device *dev = master->dev; if (drm_core_check_feature(dev, DRIVER_MODESET)) drm_lease_destroy(master); idr_destroy(&master->magic_map); idr_destroy(&master->leases); idr_destroy(&master->lessee_idr); kfree(master->unique); kfree(master); } /** * drm_master_put - unreference and clear a master pointer * @master: pointer to a pointer of &struct drm_master * * This decrements the &drm_master behind @master and sets it to NULL. */ void drm_master_put(struct drm_master **master) { kref_put(&(*master)->refcount, drm_master_destroy); *master = NULL; } EXPORT_SYMBOL(drm_master_put); /* Used by drm_client and drm_fb_helper */ bool drm_master_internal_acquire(struct drm_device *dev) { mutex_lock(&dev->master_mutex); if (dev->master) { mutex_unlock(&dev->master_mutex); return false; } return true; } EXPORT_SYMBOL(drm_master_internal_acquire); /* Used by drm_client and drm_fb_helper */ void drm_master_internal_release(struct drm_device *dev) { mutex_unlock(&dev->master_mutex); } EXPORT_SYMBOL(drm_master_internal_release);
658 618 658 657 658 2 2 6 18 6 6 10 10 10 4 3 9 10 3 2 1 1 3 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/rculist.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/bpf_local_storage.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(sk_cache); static struct bpf_local_storage_data * bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; sk_storage = rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held()); if (!sk_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = bpf_sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; bpf_local_storage_destroy(sk_storage); out: rcu_read_unlock(); migrate_enable(); } static void bpf_sk_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &sk_cache, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &sk_cache, false); } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { sdata = bpf_sk_storage_lookup(sock->sk, map, true); sockfd_put(sock); return sdata ? sdata->data : NULL; } return ERR_PTR(err); } static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } return err; } static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) { struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { err = bpf_sk_storage_del(sock->sk, map); sockfd_put(sock); return err; } return err; } static struct bpf_local_storage_elem * bpf_sk_storage_clone_elem(struct sock *newsk, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_elem *copy_selem; copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); if (!copy_selem) return NULL; if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, SDATA(selem)->data, true); else copy_map_value(&smap->map, SDATA(copy_selem)->data, SDATA(selem)->data); return copy_selem; } int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { struct bpf_local_storage *new_sk_storage = NULL; struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; int ret = 0; RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) goto out; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { struct bpf_local_storage_elem *copy_selem; struct bpf_local_storage_map *smap; struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); if (!(smap->map.map_flags & BPF_F_CLONE)) continue; /* Note that for lockless listeners adding new element * here can race with cleanup in bpf_local_storage_map_free. * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. */ map = bpf_map_inc_not_zero(&smap->map); if (IS_ERR(map)) continue; copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); if (!copy_selem) { ret = -ENOMEM; bpf_map_put(map); goto out; } if (new_sk_storage) { bpf_selem_link_map(smap, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { bpf_selem_free(copy_selem, smap, true); atomic_sub(smap->elem_size, &newsk->sk_omem_alloc); bpf_map_put(map); goto out; } new_sk_storage = rcu_dereference(copy_selem->local_storage); } bpf_map_put(map); } out: rcu_read_unlock(); migrate_enable(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. */ return ret; } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; sdata = bpf_sk_storage_lookup(sk, map, true); if (sdata) return (unsigned long)sdata->data; if (flags == BPF_SK_STORAGE_GET_F_CREATE && /* Cannot add new elem to a going away sk. * Otherwise, the new elem may become a leak * (and also other memory issues during map * destruction). */ refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ sock_put(sk); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } return (unsigned long)NULL; } BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk)) return -EINVAL; if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; err = bpf_sk_storage_del(sk, map); sock_put(sk); return err; } return -ENOENT; } static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct sock *sk = (struct sock *)owner; int optmem_max; optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { atomic_add(size, &sk->sk_omem_alloc); return 0; } return -ENOMEM; } static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, void *owner, u32 size) { struct sock *sk = owner; atomic_sub(size, &sk->sk_omem_alloc); } static struct bpf_local_storage __rcu ** bpf_sk_storage_ptr(void *owner) { struct sock *sk = owner; return &sk->sk_bpf_storage; } const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = bpf_sk_storage_map_alloc, .map_free = bpf_sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, .map_mem_usage = bpf_local_storage_map_mem_usage, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { .func = bpf_sk_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = { .func = bpf_sk_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_sk_storage_delete_proto = { .func = bpf_sk_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) { if (prog->aux->dst_prog) return false; /* Ensure the tracing program is not tracing * any bpf_sk_storage*() function and also * use the bpf_sk_storage_(get|delete) helper. */ switch (prog->expected_attach_type) { case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: /* bpf_sk_storage has no trace point */ return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: return false; } return false; } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags, gfp_t, gfp_flags) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return (unsigned long)NULL; return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags, gfp_flags); } BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return -EPERM; return ____bpf_sk_storage_delete(map, sk); } const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { .func = bpf_sk_storage_get_tracing, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, .allowed = bpf_sk_storage_tracing_allowed, }; const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { .func = bpf_sk_storage_delete_tracing, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .allowed = bpf_sk_storage_tracing_allowed, }; struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; }; /* The reply will be like: * INET_DIAG_BPF_SK_STORAGES (nla_nest) * SK_DIAG_BPF_STORAGE (nla_nest) * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) * SK_DIAG_BPF_STORAGE (nla_nest) * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) * .... */ static int nla_value_size(u32 value_size) { /* SK_DIAG_BPF_STORAGE (nla_nest) * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) */ return nla_total_size(0) + nla_total_size(sizeof(u32)) + nla_total_size_64bit(value_size); } void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) { u32 i; if (!diag) return; for (i = 0; i < diag->nr_maps; i++) bpf_map_put(diag->maps[i]); kfree(diag); } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, const struct bpf_map *map) { u32 i; for (i = 0; i < diag->nr_maps; i++) { if (diag->maps[i] == map) return true; } return false; } struct bpf_sk_storage_diag * bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) { struct bpf_sk_storage_diag *diag; struct nlattr *nla; u32 nr_maps = 0; int rem, err; /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, nla_stgs, rem) { if (nla_len(nla) != sizeof(u32)) return ERR_PTR(-EINVAL); nr_maps++; } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, nla_stgs, rem) { int map_fd = nla_get_u32(nla); struct bpf_map *map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; } if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { bpf_map_put(map); err = -EINVAL; goto err_free; } if (diag_check_dup(diag, map)) { bpf_map_put(map); err = -EEXIST; goto err_free; } diag->maps[diag->nr_maps++] = map; } return diag; err_free: bpf_sk_storage_diag_free(diag); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); if (!nla_stg) return -EMSGSIZE; smap = rcu_dereference(sdata->smap); if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) goto errout; nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, smap->map.value_size, SK_DIAG_BPF_STORAGE_PAD); if (!nla_value) goto errout; if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, nla_data(nla_value), sdata->data, true); else copy_map_value(&smap->map, nla_data(nla_value), sdata->data); nla_nest_end(skb, nla_stg); return 0; errout: nla_nest_cancel(skb, nla_stg); return -EMSGSIZE; } static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, int stg_array_type, unsigned int *res_diag_size) { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) { rcu_read_unlock(); return 0; } nla_stgs = nla_nest_start(skb, stg_array_type); if (!nla_stgs) /* Continue to learn diag_size */ err = -EMSGSIZE; saved_len = skb->len; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { smap = rcu_dereference(SDATA(selem)->smap); diag_size += nla_value_size(smap->map.value_size); if (nla_stgs && diag_get(SDATA(selem), skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } rcu_read_unlock(); if (nla_stgs) { if (saved_len == skb->len) nla_nest_cancel(skb, nla_stgs); else nla_nest_end(skb, nla_stgs); } if (diag_size == nla_total_size(0)) { *res_diag_size = 0; return 0; } *res_diag_size = diag_size; return err; } int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, struct sock *sk, struct sk_buff *skb, int stg_array_type, unsigned int *res_diag_size) { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); struct bpf_local_storage *sk_storage; struct bpf_local_storage_data *sdata; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; u32 i; *res_diag_size = 0; /* No map has been specified. Dump all. */ if (!diag->nr_maps) return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, res_diag_size); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) { rcu_read_unlock(); return 0; } nla_stgs = nla_nest_start(skb, stg_array_type); if (!nla_stgs) /* Continue to learn diag_size */ err = -EMSGSIZE; saved_len = skb->len; for (i = 0; i < diag->nr_maps; i++) { sdata = bpf_local_storage_lookup(sk_storage, (struct bpf_local_storage_map *)diag->maps[i], false); if (!sdata) continue; diag_size += nla_value_size(diag->maps[i]->value_size); if (nla_stgs && diag_get(sdata, skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } rcu_read_unlock(); if (nla_stgs) { if (saved_len == skb->len) nla_nest_cancel(skb, nla_stgs); else nla_nest_end(skb, nla_stgs); } if (diag_size == nla_total_size(0)) { *res_diag_size = 0; return 0; } *res_diag_size = diag_size; return err; } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); struct bpf_iter_seq_sk_storage_map_info { struct bpf_map *map; unsigned int bucket_id; unsigned skip_elems; }; static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, struct bpf_local_storage_elem *prev_selem) __acquires(RCU) __releases(RCU) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; u32 skip_elems = info->skip_elems; struct bpf_local_storage_map *smap; u32 bucket_id = info->bucket_id; u32 i, count, n_buckets; struct bpf_local_storage_map_bucket *b; smap = (struct bpf_local_storage_map *)info->map; n_buckets = 1U << smap->bucket_log; if (bucket_id >= n_buckets) return NULL; /* try to find next selem in the same bucket */ selem = prev_selem; count = 0; while (selem) { selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), struct bpf_local_storage_elem, map_node); if (!selem) { /* not found, unlock and go to the next bucket */ b = &smap->buckets[bucket_id++]; rcu_read_unlock(); skip_elems = 0; break; } sk_storage = rcu_dereference(selem->local_storage); if (sk_storage) { info->skip_elems = skip_elems + count; return selem; } count++; } for (i = bucket_id; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; rcu_read_lock(); count = 0; hlist_for_each_entry_rcu(selem, &b->list, map_node) { sk_storage = rcu_dereference(selem->local_storage); if (sk_storage && count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; return selem; } count++; } rcu_read_unlock(); skip_elems = 0; } info->bucket_id = i; info->skip_elems = 0; return NULL; } static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_local_storage_elem *selem; selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); if (!selem) return NULL; if (*pos == 0) ++*pos; return selem; } static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_sk_storage_map_info *info = seq->private; ++*pos; ++info->skip_elems; return bpf_sk_storage_map_seq_find_next(seq->private, v); } struct bpf_iter__bpf_sk_storage_map { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(struct sock *, sk); __bpf_md_ptr(void *, value); }; DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, struct bpf_map *map, struct sock *sk, void *value) static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, struct bpf_local_storage_elem *selem) { struct bpf_iter_seq_sk_storage_map_info *info = seq->private; struct bpf_iter__bpf_sk_storage_map ctx = {}; struct bpf_local_storage *sk_storage; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret = 0; meta.seq = seq; prog = bpf_iter_get_info(&meta, selem == NULL); if (prog) { ctx.meta = &meta; ctx.map = info->map; if (selem) { sk_storage = rcu_dereference(selem->local_storage); ctx.sk = sk_storage->owner; ctx.value = SDATA(selem)->data; } ret = bpf_iter_run_prog(prog, &ctx); } return ret; } static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) { return __bpf_sk_storage_map_seq_show(seq, v); } static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { if (!v) (void)__bpf_sk_storage_map_seq_show(seq, v); else rcu_read_unlock(); } static int bpf_iter_init_sk_storage_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; bpf_map_inc_with_uref(aux->map); seq_info->map = aux->map; return 0; } static void bpf_iter_fini_sk_storage_map(void *priv_data) { struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); } static int bpf_iter_attach_map(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto put_map; if (prog->aux->max_rdwr_access > map->value_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static const struct seq_operations bpf_sk_storage_map_seq_ops = { .start = bpf_sk_storage_map_seq_start, .next = bpf_sk_storage_map_seq_next, .stop = bpf_sk_storage_map_seq_stop, .show = bpf_sk_storage_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_sk_storage_map_seq_ops, .init_seq_private = bpf_iter_init_sk_storage_map, .fini_seq_private = bpf_iter_fini_sk_storage_map, .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), }; static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", .attach_target = bpf_iter_attach_map, .detach_target = bpf_iter_detach_map, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; static int __init bpf_sk_storage_map_iter_init(void) { bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info); } late_initcall(bpf_sk_storage_map_iter_init);
37 497 497 269 5668 149 230 1 333 487 14 14 14 14 2203 5341 332 298 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAP_H #define _LINUX_SWAP_H #include <linux/spinlock.h> #include <linux/linkage.h> #include <linux/mmzone.h> #include <linux/list.h> #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/page-flags.h> #include <uapi/linux/mempolicy.h> #include <asm/page.h> struct notifier_block; struct bio; struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64 static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; } /* * MAX_SWAPFILES defines the maximum number of swaptypes: things which can * be swapped to. The swap type and the offset into that swap type are * encoded into pte's and into pgoff_t's in the swapcache. Using five bits * for the type means that the maximum number of swapcache pages is 27 bits * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 /* * Use some of the swap files numbers for other purposes. This * is a convenient way to hook into the VM to trigger special * actions on faults. */ /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be * applied to the leaves of pgtables. */ #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * * When a page is migrated from CPU to device, we set the CPU page table entry * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. */ #ifdef CONFIG_DEVICE_PRIVATE #define SWP_DEVICE_NUM 4 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) #define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) #else #define SWP_DEVICE_NUM 0 #endif /* * Page migration support. * * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and * indicates that the referenced (part of) an anonymous page is exclusive to * a single process. For SWP_MIGRATION_WRITE, that information is implicit: * (part of) an anonymous page that are mapped writable are exclusive to a * single process. */ #ifdef CONFIG_MIGRATION #define SWP_MIGRATION_NUM 3 #define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) #define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif /* * Handling of hardware poisoned pages with memory corruption. */ #ifdef CONFIG_MEMORY_FAILURE #define SWP_HWPOISON_NUM 1 #define SWP_HWPOISON MAX_SWAPFILES #else #define SWP_HWPOISON_NUM 0 #endif #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is * what the swap magic looks like for the old (limited to 128MB) * swap area format, the second part of the union adds - in the * old reserved area - some extra information. Note that the first * kilobyte is reserved for boot loader or disk label stuff... * * Having the magic at the end of the PAGE_SIZE makes detecting swap * areas somewhat tricky on machines that support multiple page sizes. * For 2.5 we'll probably want to move the magic to just beyond the * bootbits... */ union swap_header { struct { char reserved[PAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { char bootbits[1024]; /* Space for disklabel etc. */ __u32 version; __u32 last_page; __u32 nr_badpages; unsigned char sws_uuid[16]; unsigned char sws_volume[16]; __u32 padding[117]; __u32 badpages[1]; } info; }; /* * current->reclaim_state points to one of these when a task is running * memory reclaim */ struct reclaim_state { /* pages reclaimed outside of LRU-based reclaim */ unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; /* * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based * reclaim * @pages: number of pages reclaimed * * If the current process is undergoing a reclaim operation, increment the * number of reclaimed pages by @pages. */ static inline void mm_account_reclaimed_pages(unsigned long pages) { if (current->reclaim_state) current->reclaim_state->reclaimed += pages; } #ifdef __KERNEL__ struct address_space; struct sysinfo; struct writeback_control; struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; }; /* * Max bad pages in the new format.. */ #define MAX_SWAP_BADPAGES \ ((offsetof(union swap_header, magic.magic) - \ offsetof(union swap_header, info.badpages)) / sizeof(int)) enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ }; #define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * * The flags field determines if a cluster is free. This is * protected by cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields * other than list, and swap_info_struct->swap_map * elements corresponding to the swap cluster. */ u16 count; u8 flags; u8 order; struct list_head list; }; /* All on-list cluster must have a non-zero flag. */ enum swap_cluster_flags { CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ CLUSTER_FLAG_FREE, CLUSTER_FLAG_NONFULL, CLUSTER_FLAG_FRAG, /* Clusters with flags above are allocatable */ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, CLUSTER_FLAG_FULL, CLUSTER_FLAG_DISCARD, CLUSTER_FLAG_MAX, }; /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as * a sentinel to indicate an entry is not valid. */ #define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) #else #define SWAP_NR_ORDERS 1 #endif /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from * its own cluster and swapout sequentially. The purpose is to optimize swapout * throughput. */ struct percpu_cluster { local_lock_t lock; /* Protect the percpu_cluster above */ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; /* * The in-memory structure used to track swap areas. */ struct swap_info_struct { struct percpu_ref users; /* indicate and keep swap device valid. */ unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, * cluster_nr, lowest_alloc, * highest_alloc, free/discard cluster * list. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ spinlock_t cont_lock; /* * protect swap count continuation page * list. */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one * entry per node. * Must be last as the number of the * array is nr_node_ids, which is not * a fixed value so have to allocate * dynamically. * And it has to be an array so that * plist_for_each_* can work. */ }; static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) { return atomic_read(&lru_disable_count); } static inline void lru_cache_enable(void) { atomic_dec(&lru_disable_count); } extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; #else #define node_reclaim_mode 0 #endif static inline bool node_reclaim_enabled(void) { /* Is any node_reclaim_mode bit set? */ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); } void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } void free_swap_cache(struct folio *folio); void free_page_and_swap_cache(struct page *); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } static inline long get_nr_swap_pages(void) { return atomic_long_read(&nr_swap_pages); } extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return NULL; } static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; } static inline void put_swap_device(struct swap_info_struct *si) { } #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL #define vm_swap_full() 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) /* only sparc can not include linux/pagemap.h in this file * so leave put_page and release_pages undeclared... */ #define free_page_and_swap_cache(page) \ put_page(page) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) { } static inline void free_swap_cache(struct folio *folio) { } static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; } static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } static inline int swap_duplicate(swp_entry_t swp) { return 0; } static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } static inline int __swap_count(swp_entry_t entry) { return 0; } static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { return 0; } static inline int swp_swapcount(swp_entry_t entry) { return 0; } static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; entry.val = 0; return entry; } static inline bool folio_free_swap(struct folio *folio) { return false; } static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { return -EINVAL; } #endif /* CONFIG_SWAP */ static inline void free_swap_and_cache(swp_entry_t entry) { free_swap_and_cache_nr(entry, 1); } static inline void swap_free(swp_entry_t entry) { swap_free_nr(entry, 1); } #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return READ_ONCE(vm_swappiness); /* root ? */ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return READ_ONCE(vm_swappiness); return READ_ONCE(memcg->swappiness); } #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { return READ_ONCE(vm_swappiness); } #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; __folio_throttle_swaprate(folio, gfp); } #else static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { } #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_swap(entry, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { } static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { } static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } #endif #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */
15 16 16 16 16 16 16 23 7 7 7 7 6 6 1 12 12 7 7 7 6 12 12 14 3 31 31 31 31 4 31 31 6 31 6 6 30 20 20 22 20 20 20 20 13 9 8 9 9 9 20 2 2 2 1 18 18 17 17 17 17 17 2 9 22 13 8 13 10 10 9 16 11 28 16 13 13 10 5 3 3 8 6 1 8 31 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 6 5 4 3 2 1 1 2 6 7 6 5 4 3 7 16 4 4 16 16 2 1 1 1 1 1 1 1 4 4 4 32 3 2 1 1 1 4 5 4 4 2 3 21 21 21 21 21 21 22 22 21 22 22 22 21 22 22 23 22 1 22 1 22 22 22 22 23 14 13 12 12 12 12 12 14 374 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel Connection Multiplexor * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #include <linux/bpf.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/file.h> #include <linux/filter.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/uaccess.h> #include <linux/workqueue.h> #include <linux/syscalls.h> #include <linux/sched/signal.h> #include <net/kcm.h> #include <net/netns/generic.h> #include <net/sock.h> #include <uapi/linux/kcm.h> #include <trace/events/sock.h> unsigned int kcm_net_id; static struct kmem_cache *kcm_psockp __read_mostly; static struct kmem_cache *kcm_muxp __read_mostly; static struct workqueue_struct *kcm_wq; static inline struct kcm_sock *kcm_sk(const struct sock *sk) { return (struct kcm_sock *)sk; } static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) { return (struct kcm_tx_msg *)skb->cb; } static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; sk_error_report(csk); } static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, bool wakeup_kcm) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; /* Unrecoverable error in transmit */ spin_lock_bh(&mux->lock); if (psock->tx_stopped) { spin_unlock_bh(&mux->lock); return; } psock->tx_stopped = 1; KCM_STATS_INCR(psock->stats.tx_aborts); if (!psock->tx_kcm) { /* Take off psocks_avail list */ list_del(&psock->psock_avail_list); } else if (wakeup_kcm) { /* In this case psock is being aborted while outside of * write_msgs and psock is reserved. Schedule tx_work * to handle the failure there. Need to commit tx_stopped * before queuing work. */ smp_mb(); queue_work(kcm_wq, &psock->tx_kcm->tx_work); } spin_unlock_bh(&mux->lock); /* Report error on lower socket */ report_csk_error(csk, err); } /* RX mux lock held. */ static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, psock->strp.stats.bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += psock->strp.stats.msgs - psock->saved_rx_msgs; psock->saved_rx_msgs = psock->strp.stats.msgs; psock->saved_rx_bytes = psock->strp.stats.bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { KCM_STATS_ADD(mux->stats.tx_bytes, psock->stats.tx_bytes - psock->saved_tx_bytes); mux->stats.tx_msgs += psock->stats.tx_msgs - psock->saved_tx_msgs; psock->saved_tx_msgs = psock->stats.tx_msgs; psock->saved_tx_bytes = psock->stats.tx_bytes; } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); /* KCM is ready to receive messages on its queue-- either the KCM is new or * has become unblocked after being blocked on full socket buffer. Queue any * pending ready messages on a psock. RX mux lock held. */ static void kcm_rcv_ready(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct sk_buff *skb; if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) return; while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Assuming buffer limit has been reached */ skb_queue_head(&mux->rx_hold_queue, skb); WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } } while (!list_empty(&mux->psocks_ready)) { psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, psock_ready_list); if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { /* Assuming buffer limit has been reached */ WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); return; } /* Consumed the ready message on the psock. Schedule rx_work to * get more messages. */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); strp_unpause(&psock->strp); strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, true); } static void kcm_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct kcm_sock *kcm = kcm_sk(sk); struct kcm_mux *mux = kcm->mux; unsigned int len = skb->truesize; sk_mem_uncharge(sk, len); atomic_sub(len, &sk->sk_rmem_alloc); /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } } static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) return -ENOMEM; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return -ENOBUFS; skb->dev = NULL; skb_orphan(skb); skb->sk = sk; skb->destructor = kcm_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); skb_queue_tail(list, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } /* Requeue received messages for a kcm socket to other kcm sockets. This is * called with a kcm socket is receive disabled. * RX mux lock held. */ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) { struct sk_buff *skb; struct kcm_sock *kcm; while ((skb = skb_dequeue(head))) { /* Reset destructor to avoid calling kcm_rcv_ready */ skb->destructor = sock_rfree; skb_orphan(skb); try_again: if (list_empty(&mux->kcm_rx_waiters)) { skb_queue_tail(&mux->rx_hold_queue, skb); continue; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); /* Commit rx_wait to read in kcm_free */ smp_wmb(); goto try_again; } } } /* Lower sock lock held */ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, struct sk_buff *head) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; WARN_ON(psock->ready_rx_msg); if (psock->rx_kcm) return psock->rx_kcm; spin_lock_bh(&mux->rx_lock); if (psock->rx_kcm) { spin_unlock_bh(&mux->rx_lock); return psock->rx_kcm; } kcm_update_rx_mux_stats(mux, psock); if (list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); return NULL; } kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); psock->rx_kcm = kcm; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, psock); spin_unlock_bh(&mux->rx_lock); return kcm; } static void kcm_done(struct kcm_sock *kcm); static void kcm_done_work(struct work_struct *w) { kcm_done(container_of(w, struct kcm_sock, done_work)); } /* Lower sock held */ static void unreserve_rx_kcm(struct kcm_psock *psock, bool rcv_ready) { struct kcm_sock *kcm = psock->rx_kcm; struct kcm_mux *mux = psock->mux; if (!kcm) return; spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_psock, NULL); /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree */ smp_mb(); if (unlikely(kcm->done)) { spin_unlock_bh(&mux->rx_lock); /* Need to run kcm_done in a task since we need to qcquire * callback locks which may already be held here. */ INIT_WORK(&kcm->done_work, kcm_done_work); schedule_work(&kcm->done_work); return; } if (unlikely(kcm->rx_disabled)) { requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { /* Check for degenerative race with rx_wait that all * data was dequeued (accounted for in kcm_rfree). */ kcm_rcv_ready(kcm); } spin_unlock_bh(&mux->rx_lock); } /* Lower sock lock held */ static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; trace_sk_data_ready(sk); read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (likely(psock)) strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } /* Called with lower sock held */ static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct kcm_sock *kcm; try_queue: kcm = reserve_rx_kcm(psock, skb); if (!kcm) { /* Unable to reserve a KCM, message is held in psock and strp * is paused. */ return; } if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ unreserve_rx_kcm(psock, false); goto try_queue; } } static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); struct bpf_prog *prog = psock->bpf_prog; int res; res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } static int kcm_read_sock_done(struct strparser *strp, int err) { struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); unreserve_rx_kcm(psock, true); return err; } static void psock_state_change(struct sock *sk) { /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here * since application will normally not poll with EPOLLIN * on the TCP sockets. */ report_csk_error(sk, EPIPE); } static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; struct kcm_sock *kcm; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; if (kcm && !unlikely(kcm->tx_stopped)) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); out: read_unlock_bh(&sk->sk_callback_lock); } static void unreserve_psock(struct kcm_sock *kcm); /* kcm sock is locked. */ static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; psock = kcm->tx_psock; smp_rmb(); /* Must read tx_psock before tx_wait */ if (psock) { WARN_ON(kcm->tx_wait); if (unlikely(psock->tx_stopped)) unreserve_psock(kcm); else return kcm->tx_psock; } spin_lock_bh(&mux->lock); /* Check again under lock to see if psock was reserved for this * psock via psock_unreserve. */ psock = kcm->tx_psock; if (unlikely(psock)) { WARN_ON(kcm->tx_wait); spin_unlock_bh(&mux->lock); return kcm->tx_psock; } if (!list_empty(&mux->psocks_avail)) { psock = list_first_entry(&mux->psocks_avail, struct kcm_psock, psock_avail_list); list_del(&psock->psock_avail_list); if (kcm->tx_wait) { list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } kcm->tx_psock = psock; psock->tx_kcm = kcm; KCM_STATS_INCR(psock->stats.reserved); } else if (!kcm->tx_wait) { list_add_tail(&kcm->wait_psock_list, &mux->kcm_tx_waiters); kcm->tx_wait = true; } spin_unlock_bh(&mux->lock); return psock; } /* mux lock held */ static void psock_now_avail(struct kcm_psock *psock) { struct kcm_mux *mux = psock->mux; struct kcm_sock *kcm; if (list_empty(&mux->kcm_tx_waiters)) { list_add_tail(&psock->psock_avail_list, &mux->psocks_avail); } else { kcm = list_first_entry(&mux->kcm_tx_waiters, struct kcm_sock, wait_psock_list); list_del(&kcm->wait_psock_list); kcm->tx_wait = false; psock->tx_kcm = kcm; /* Commit before changing tx_psock since that is read in * reserve_psock before queuing work. */ smp_mb(); kcm->tx_psock = psock; KCM_STATS_INCR(psock->stats.reserved); queue_work(kcm_wq, &kcm->tx_work); } } /* kcm sock is locked. */ static void unreserve_psock(struct kcm_sock *kcm) { struct kcm_psock *psock; struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); psock = kcm->tx_psock; if (WARN_ON(!psock)) { spin_unlock_bh(&mux->lock); return; } smp_rmb(); /* Read tx_psock before tx_wait */ kcm_update_tx_mux_stats(mux, psock); WARN_ON(kcm->tx_wait); kcm->tx_psock = NULL; psock->tx_kcm = NULL; KCM_STATS_INCR(psock->stats.unreserved); if (unlikely(psock->tx_stopped)) { if (psock->done) { /* Deferred free */ list_del(&psock->psock_list); mux->psocks_cnt--; sock_put(psock->sk); fput(psock->sk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } /* Don't put back on available list */ spin_unlock_bh(&mux->lock); return; } psock_now_avail(psock); spin_unlock_bh(&mux->lock); } static void kcm_report_tx_retry(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; spin_lock_bh(&mux->lock); KCM_STATS_INCR(mux->stats.tx_retries); spin_unlock_bh(&mux->lock); } /* Write any messages ready on the kcm socket. Called with kcm sock lock * held. Return bytes actually sent or error. */ static int kcm_write_msgs(struct kcm_sock *kcm) { unsigned int total_sent = 0; struct sock *sk = &kcm->sk; struct kcm_psock *psock; struct sk_buff *head; int ret = 0; kcm->tx_wait_more = false; psock = kcm->tx_psock; if (unlikely(psock && psock->tx_stopped)) { /* A reserved psock was aborted asynchronously. Unreserve * it and we'll retry the message. */ unreserve_psock(kcm); kcm_report_tx_retry(kcm); if (skb_queue_empty(&sk->sk_write_queue)) return 0; kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false; } retry: while ((head = skb_peek(&sk->sk_write_queue))) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; struct kcm_tx_msg *txm = kcm_tx_msg(head); struct sk_buff *skb; unsigned int msize; int i; if (!txm->started_tx) { psock = reserve_psock(kcm); if (!psock) goto out; skb = head; txm->frag_offset = 0; txm->sent = 0; txm->started_tx = true; } else { if (WARN_ON(!psock)) { ret = -EINVAL; goto out; } skb = txm->frag_skb; } if (WARN_ON(!skb_shinfo(skb)->nr_frags) || WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) { ret = -EINVAL; goto out; } msize = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) msize += skb_frag_size(&skb_shinfo(skb)->frags[i]); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, (const struct bio_vec *)skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags, msize); iov_iter_advance(&msg.msg_iter, txm->frag_offset); do { ret = sock_sendmsg(psock->sk->sk_socket, &msg); if (ret <= 0) { if (ret == -EAGAIN) { /* Save state to try again when there's * write space on the socket */ txm->frag_skb = skb; ret = 0; goto out; } /* Hard failure in sending message, abort this * psock since it has lost framing * synchronization and retry sending the * message from the beginning. */ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); psock = NULL; txm->started_tx = false; kcm_report_tx_retry(kcm); ret = 0; goto retry; } txm->sent += ret; txm->frag_offset += ret; KCM_STATS_ADD(psock->stats.tx_bytes, ret); } while (msg.msg_iter.count > 0); if (skb == head) { if (skb_has_frag_list(skb)) { txm->frag_skb = skb_shinfo(skb)->frag_list; txm->frag_offset = 0; continue; } } else if (skb->next) { txm->frag_skb = skb->next; txm->frag_offset = 0; continue; } /* Successfully sent the whole packet, account for it. */ sk->sk_wmem_queued -= txm->sent; total_sent += txm->sent; skb_dequeue(&sk->sk_write_queue); kfree_skb(head); KCM_STATS_INCR(psock->stats.tx_msgs); } out: if (!head) { /* Done with all queued messages. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); if (psock) unreserve_psock(kcm); } /* Check if write space is available */ sk->sk_write_space(sk); return total_sent ? : ret; } static void kcm_tx_work(struct work_struct *w) { struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); struct sock *sk = &kcm->sk; int err; lock_sock(sk); /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx * aborts */ err = kcm_write_msgs(kcm); if (err < 0) { /* Hard failure in write, report error on KCM socket */ pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); report_csk_error(&kcm->sk, -err); goto out; } /* Primarily for SOCK_SEQPACKET sockets */ if (likely(sk->sk_socket) && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_space(sk); } out: release_sock(sk); } static void kcm_push(struct kcm_sock *kcm) { if (kcm->tx_wait_more) kcm_write_msgs(kcm); } static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct sk_buff *skb = NULL, *head = NULL; size_t copy, copied = 0; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); int eor = (sock->type == SOCK_DGRAM) ? !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); int err = -EPIPE; mutex_lock(&kcm->tx_mutex); lock_sock(sk); /* Per tcp_sendmsg this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err) goto out_error; if (kcm->seq_skb) { /* Previously opened message */ head = kcm->seq_skb; skb = kcm_tx_msg(head)->last_skb; goto start; } /* Call the sk_stream functions to manage the sndbuf mem. */ if (!sk_stream_memory_free(sk)) { kcm_push(kcm); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (msg_data_left(msg)) { /* New message, alloc head skb */ head = alloc_skb(0, sk->sk_allocation); while (!head) { kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; head = alloc_skb(0, sk->sk_allocation); } skb = head; /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling * csum_and_copy_from_iter from skb_do_copy_data_nocache. */ skb->ip_summed = CHECKSUM_UNNECESSARY; } start: while (msg_data_left(msg)) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_memory; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i == MAX_SKB_FRAGS) { struct sk_buff *tskb; tskb = alloc_skb(0, sk->sk_allocation); if (!tskb) goto wait_for_memory; if (head == skb) skb_shinfo(head)->frag_list = tskb; else skb->next = tskb; skb = tskb; skb->ip_summed = CHECKSUM_UNNECESSARY; continue; } merge = false; } if (msg->msg_flags & MSG_SPLICE_PAGES) { copy = msg_data_left(msg); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) { if (err == -EMSGSIZE) goto wait_for_memory; goto out_error; } copy = err; skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); if (head != skb) head->truesize += copy; } else { copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto out_error; /* Update the skb. */ if (merge) { skb_frag_size_add( &skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); get_page(pfrag->page); } pfrag->offset += copy; } copied += copy; if (head != skb) { head->len += copy; head->data_len += copy; } continue; wait_for_memory: kcm_push(kcm); err = sk_stream_wait_memory(sk, &timeo); if (err) goto out_error; } if (eor) { bool not_busy = skb_queue_empty(&sk->sk_write_queue); if (head) { /* Message complete, queue it on send buffer */ __skb_queue_tail(&sk->sk_write_queue, head); kcm->seq_skb = NULL; KCM_STATS_INCR(kcm->stats.tx_msgs); } if (msg->msg_flags & MSG_BATCH) { kcm->tx_wait_more = true; } else if (kcm->tx_wait_more || not_busy) { err = kcm_write_msgs(kcm); if (err < 0) { /* We got a hard error in write_msgs but have * already queued this message. Report an error * in the socket, but don't affect return value * from sendmsg */ pr_warn("KCM: Hard failure on kcm_write_msgs\n"); report_csk_error(&kcm->sk, -err); } } } else { /* Message not complete, save state */ partial_message: if (head) { kcm->seq_skb = head; kcm_tx_msg(head)->last_skb = skb; } } KCM_STATS_ADD(kcm->stats.tx_bytes, copied); release_sock(sk); mutex_unlock(&kcm->tx_mutex); return copied; out_error: kcm_push(kcm); if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. */ if (copied) goto partial_message; if (head != kcm->seq_skb) kfree_skb(head); } else { kfree_skb(head); kcm->seq_skb = NULL; } err = sk_stream_error(sk, msg->msg_flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) sk->sk_write_space(sk); release_sock(sk); mutex_unlock(&kcm->tx_mutex); return err; } static void kcm_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); if (skb_queue_empty_lockless(&sk->sk_write_queue)) return; lock_sock(sk); kcm_write_msgs(kcm); release_sock(sk); } static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); int err = 0; struct strp_msg *stm; int copied = 0; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; err = skb_copy_datagram_msg(skb, stm->offset, msg, len); if (err < 0) goto out; copied = len; if (likely(!(flags & MSG_PEEK))) { KCM_STATS_ADD(kcm->stats.rx_bytes, copied); if (copied < stm->full_len) { if (sock->type == SOCK_DGRAM) { /* Truncated message */ msg->msg_flags |= MSG_TRUNC; goto msg_finished; } stm->offset += copied; stm->full_len -= copied; } else { msg_finished: /* Finished with message */ msg->msg_flags |= MSG_EOR; KCM_STATS_INCR(kcm->stats.rx_msgs); } } out: skb_free_datagram(sk, skb); return copied ? : err; } static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); struct strp_msg *stm; int err = 0; ssize_t copied; struct sk_buff *skb; /* Only support splice for SOCKSEQPACKET */ skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto err_out; /* Okay, have a message on the receive queue */ stm = strp_msg(skb); if (len > stm->full_len) len = stm->full_len; copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; } KCM_STATS_ADD(kcm->stats.rx_bytes, copied); stm->offset += copied; stm->full_len -= copied; /* We have no way to return MSG_EOR. If all the bytes have been * read we still leave the message in the receive socket buffer. * A subsequent recvmsg needs to be done to return MSG_EOR and * finish reading the message. */ skb_free_datagram(sk, skb); return copied; err_out: skb_free_datagram(sk, skb); return err; } /* kcm sock lock held */ static void kcm_recv_disable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 1; /* If a psock is reserved we'll do cleanup in unreserve */ if (!kcm->rx_psock) { if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); } spin_unlock_bh(&mux->rx_lock); } /* kcm sock lock held */ static void kcm_recv_enable(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; if (!kcm->rx_disabled) return; spin_lock_bh(&mux->rx_lock); kcm->rx_disabled = 0; kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, valbool; int err = 0; if (level != SOL_KCM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; switch (optname) { case KCM_RECV_DISABLE: lock_sock(&kcm->sk); if (valbool) kcm_recv_disable(kcm); else kcm_recv_enable(kcm); release_sock(&kcm->sk); break; default: err = -ENOPROTOOPT; } return err; } static int kcm_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); int val, len; if (level != SOL_KCM) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case KCM_RECV_DISABLE: val = kcm->rx_disabled; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) { struct kcm_sock *tkcm; struct list_head *head; int index = 0; /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so * we set sk_state, otherwise epoll_wait always returns right away with * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; /* Add to mux's kcm sockets list */ kcm->mux = mux; spin_lock_bh(&mux->lock); head = &mux->kcm_socks; list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { if (tkcm->index != index) break; head = &tkcm->kcm_sock_list; index++; } list_add(&kcm->kcm_sock_list, head); kcm->index = index; mux->kcm_socks_cnt++; spin_unlock_bh(&mux->lock); INIT_WORK(&kcm->tx_work, kcm_tx_work); mutex_init(&kcm->tx_mutex); spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); spin_unlock_bh(&mux->rx_lock); } static int kcm_attach(struct socket *sock, struct socket *csock, struct bpf_prog *prog) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct sock *csk; struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; static const struct strp_callbacks cb = { .rcv_msg = kcm_rcv_strparser, .parse_msg = kcm_parse_func_strparser, .read_sock_done = kcm_read_sock_done, }; int err = 0; csk = csock->sk; if (!csk) return -EINVAL; lock_sock(csk); /* Only allow TCP sockets to be attached for now */ if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || csk->sk_protocol != IPPROTO_TCP) { err = -EOPNOTSUPP; goto out; } /* Don't allow listeners or closed sockets */ if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) { err = -EOPNOTSUPP; goto out; } psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) { err = -ENOMEM; goto out; } psock->mux = mux; psock->sk = csk; psock->bpf_prog = prog; write_lock_bh(&csk->sk_callback_lock); /* Check if sk_user_data is already by KCM or someone else. * Must be done under lock to prevent race conditions. */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); err = -EALREADY; goto out; } err = strp_init(&psock->strp, csk, &cb); if (err) { write_unlock_bh(&csk->sk_callback_lock); kmem_cache_free(kcm_psockp, psock); goto out; } psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; csk->sk_user_data = psock; csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; write_unlock_bh(&csk->sk_callback_lock); sock_hold(csk); /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; list_for_each_entry(tpsock, &mux->psocks, psock_list) { if (tpsock->index != index) break; head = &tpsock->psock_list; index++; } list_add(&psock->psock_list, head); psock->index = index; KCM_STATS_INCR(mux->stats.psock_attach); mux->psocks_cnt++; psock_now_avail(psock); spin_unlock_bh(&mux->lock); /* Schedule RX work in case there are already bytes queued */ strp_check_rcv(&psock->strp); out: release_sock(csk); return err; } static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) { struct socket *csock; struct bpf_prog *prog; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } err = kcm_attach(sock, csock, prog); if (err) { bpf_prog_put(prog); goto out; } /* Keep reference on file also */ return 0; out: sockfd_put(csock); return err; } static void kcm_unattach(struct kcm_psock *psock) { struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; lock_sock(csk); /* Stop getting callbacks from TCP socket. After this there should * be no way to reserve a kcm for this psock. */ write_lock_bh(&csk->sk_callback_lock); csk->sk_user_data = NULL; csk->sk_data_ready = psock->save_data_ready; csk->sk_write_space = psock->save_write_space; csk->sk_state_change = psock->save_state_change; strp_stop(&psock->strp); if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); release_sock(csk); return; } spin_lock_bh(&mux->rx_lock); /* Stop receiver activities. After this point psock should not be * able to get onto ready list either through callbacks or work. */ if (psock->ready_rx_msg) { list_del(&psock->psock_ready_list); kfree_skb(psock->ready_rx_msg); psock->ready_rx_msg = NULL; KCM_STATS_INCR(mux->stats.rx_ready_drops); } spin_unlock_bh(&mux->rx_lock); write_unlock_bh(&csk->sk_callback_lock); /* Call strp_done without sock lock */ release_sock(csk); strp_done(&psock->strp); lock_sock(csk); bpf_prog_put(psock->bpf_prog); spin_lock_bh(&mux->lock); aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); save_strp_stats(&psock->strp, &mux->aggregate_strp_stats); KCM_STATS_INCR(mux->stats.psock_unattach); if (psock->tx_kcm) { /* psock was reserved. Just mark it finished and we will clean * up in the kcm paths, we need kcm lock which can not be * acquired here. */ KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); spin_unlock_bh(&mux->lock); /* We are unattaching a socket that is reserved. Abort the * socket since we may be out of sync in sending on it. We need * to do this without the mux lock. */ kcm_abort_tx_psock(psock, EPIPE, false); spin_lock_bh(&mux->lock); if (!psock->tx_kcm) { /* psock now unreserved in window mux was unlocked */ goto no_reserved; } psock->done = 1; /* Commit done before queuing work to process it */ smp_mb(); /* Queue tx work to make sure psock->done is handled */ queue_work(kcm_wq, &psock->tx_kcm->tx_work); spin_unlock_bh(&mux->lock); } else { no_reserved: if (!psock->tx_stopped) list_del(&psock->psock_avail_list); list_del(&psock->psock_list); mux->psocks_cnt--; spin_unlock_bh(&mux->lock); sock_put(csk); fput(csk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } release_sock(csk); } static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) { struct kcm_sock *kcm = kcm_sk(sock->sk); struct kcm_mux *mux = kcm->mux; struct kcm_psock *psock; struct socket *csock; struct sock *csk; int err; csock = sockfd_lookup(info->fd, &err); if (!csock) return -ENOENT; csk = csock->sk; if (!csk) { err = -EINVAL; goto out; } err = -ENOENT; spin_lock_bh(&mux->lock); list_for_each_entry(psock, &mux->psocks, psock_list) { if (psock->sk != csk) continue; /* Found the matching psock */ if (psock->unattaching || WARN_ON(psock->done)) { err = -EALREADY; break; } psock->unattaching = 1; spin_unlock_bh(&mux->lock); /* Lower socket lock should already be held */ kcm_unattach(psock); err = 0; goto out; } spin_unlock_bh(&mux->lock); out: sockfd_put(csock); return err; } static struct proto kcm_proto = { .name = "KCM", .owner = THIS_MODULE, .obj_size = sizeof(struct kcm_sock), }; /* Clone a kcm socket. */ static struct file *kcm_clone(struct socket *osock) { struct socket *newsock; struct sock *newsk; newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); newsock->type = osock->type; newsock->ops = osock->ops; __module_get(newsock->ops->owner); newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, &kcm_proto, false); if (!newsk) { sock_release(newsock); return ERR_PTR(-ENOMEM); } sock_init_data(newsock, newsk); init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); } static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err; switch (cmd) { case SIOCKCMATTACH: { struct kcm_attach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_attach_ioctl(sock, &info); break; } case SIOCKCMUNATTACH: { struct kcm_unattach info; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; err = kcm_unattach_ioctl(sock, &info); break; } case SIOCKCMCLONE: { struct kcm_clone info; struct file *file; info.fd = get_unused_fd_flags(0); if (unlikely(info.fd < 0)) return info.fd; file = kcm_clone(sock); if (IS_ERR(file)) { put_unused_fd(info.fd); return PTR_ERR(file); } if (copy_to_user((void __user *)arg, &info, sizeof(info))) { put_unused_fd(info.fd); fput(file); return -EFAULT; } fd_install(info.fd, file); err = 0; break; } default: err = -ENOIOCTLCMD; break; } return err; } static void release_mux(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; struct kcm_psock *psock, *tmp_psock; /* Release psocks */ list_for_each_entry_safe(psock, tmp_psock, &mux->psocks, psock_list) { if (!WARN_ON(psock->unattaching)) kcm_unattach(psock); } if (WARN_ON(mux->psocks_cnt)) return; __skb_queue_purge(&mux->rx_hold_queue); mutex_lock(&knet->mutex); aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &knet->aggregate_psock_stats); aggregate_strp_stats(&mux->aggregate_strp_stats, &knet->aggregate_strp_stats); list_del_rcu(&mux->kcm_mux_list); knet->count--; mutex_unlock(&knet->mutex); kfree_rcu(mux, rcu); } static void kcm_done(struct kcm_sock *kcm) { struct kcm_mux *mux = kcm->mux; struct sock *sk = &kcm->sk; int socks_cnt; spin_lock_bh(&mux->rx_lock); if (kcm->rx_psock) { /* Cleanup in unreserve_rx_kcm */ WARN_ON(kcm->done); kcm->rx_disabled = 1; kcm->done = 1; spin_unlock_bh(&mux->rx_lock); return; } if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); /* paired with lockless reads in kcm_rfree() */ WRITE_ONCE(kcm->rx_wait, false); } /* Move any pending receive messages to other kcm sockets */ requeue_rx_msgs(mux, &sk->sk_receive_queue); spin_unlock_bh(&mux->rx_lock); if (WARN_ON(sk_rmem_alloc_get(sk))) return; /* Detach from MUX */ spin_lock_bh(&mux->lock); list_del(&kcm->kcm_sock_list); mux->kcm_socks_cnt--; socks_cnt = mux->kcm_socks_cnt; spin_unlock_bh(&mux->lock); if (!socks_cnt) { /* We are done with the mux now. */ release_mux(mux); } WARN_ON(kcm->rx_wait); sock_put(&kcm->sk); } /* Called by kcm_release to close a KCM socket. * If this is the last KCM socket on the MUX, destroy the MUX. */ static int kcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct kcm_sock *kcm; struct kcm_mux *mux; struct kcm_psock *psock; if (!sk) return 0; kcm = kcm_sk(sk); mux = kcm->mux; lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); /* Set tx_stopped. This is checked when psock is bound to a kcm and we * get a writespace callback. This prevents further work being queued * from the callback (unbinding the psock occurs after canceling work. */ kcm->tx_stopped = 1; release_sock(sk); spin_lock_bh(&mux->lock); if (kcm->tx_wait) { /* Take of tx_wait list, after this point there should be no way * that a psock will be assigned to this kcm. */ list_del(&kcm->wait_psock_list); kcm->tx_wait = false; } spin_unlock_bh(&mux->lock); /* Cancel work. After this point there should be no outside references * to the kcm socket. */ cancel_work_sync(&kcm->tx_work); lock_sock(sk); psock = kcm->tx_psock; if (psock) { /* A psock was reserved, so we need to kill it since it * may already have some bytes queued from a message. We * need to do this after removing kcm from tx_wait list. */ kcm_abort_tx_psock(psock, EPIPE, false); unreserve_psock(kcm); } release_sock(sk); WARN_ON(kcm->tx_wait); WARN_ON(kcm->tx_psock); sock->sk = NULL; kcm_done(kcm); return 0; } static const struct proto_ops kcm_dgram_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, }; static const struct proto_ops kcm_seqpacket_ops = { .family = PF_KCM, .owner = THIS_MODULE, .release = kcm_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = kcm_setsockopt, .getsockopt = kcm_getsockopt, .sendmsg = kcm_sendmsg, .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, .splice_read = kcm_splice_read, }; /* Create proto operation for kcm sockets */ static int kcm_create(struct net *net, struct socket *sock, int protocol, int kern) { struct kcm_net *knet = net_generic(net, kcm_net_id); struct sock *sk; struct kcm_mux *mux; switch (sock->type) { case SOCK_DGRAM: sock->ops = &kcm_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &kcm_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } if (protocol != KCMPROTO_CONNECTED) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); if (!sk) return -ENOMEM; /* Allocate a kcm mux, shared between KCM sockets */ mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); if (!mux) { sk_free(sk); return -ENOMEM; } spin_lock_init(&mux->lock); spin_lock_init(&mux->rx_lock); INIT_LIST_HEAD(&mux->kcm_socks); INIT_LIST_HEAD(&mux->kcm_rx_waiters); INIT_LIST_HEAD(&mux->kcm_tx_waiters); INIT_LIST_HEAD(&mux->psocks); INIT_LIST_HEAD(&mux->psocks_ready); INIT_LIST_HEAD(&mux->psocks_avail); mux->knet = knet; /* Add new MUX to list */ mutex_lock(&knet->mutex); list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); knet->count++; mutex_unlock(&knet->mutex); skb_queue_head_init(&mux->rx_hold_queue); /* Init KCM socket */ sock_init_data(sock, sk); init_kcm_sock(kcm_sk(sk), mux); return 0; } static const struct net_proto_family kcm_family_ops = { .family = PF_KCM, .create = kcm_create, .owner = THIS_MODULE, }; static __net_init int kcm_init_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); INIT_LIST_HEAD_RCU(&knet->mux_list); mutex_init(&knet->mutex); return 0; } static __net_exit void kcm_exit_net(struct net *net) { struct kcm_net *knet = net_generic(net, kcm_net_id); /* All KCM sockets should be closed at this point, which should mean * that all multiplexors and psocks have been destroyed. */ WARN_ON(!list_empty(&knet->mux_list)); mutex_destroy(&knet->mutex); } static struct pernet_operations kcm_net_ops = { .init = kcm_init_net, .exit = kcm_exit_net, .id = &kcm_net_id, .size = sizeof(struct kcm_net), }; static int __init kcm_init(void) { int err = -ENOMEM; kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN); if (!kcm_muxp) goto fail; kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN); if (!kcm_psockp) goto fail; kcm_wq = create_singlethread_workqueue("kkcmd"); if (!kcm_wq) goto fail; err = proto_register(&kcm_proto, 1); if (err) goto fail; err = register_pernet_device(&kcm_net_ops); if (err) goto net_ops_fail; err = sock_register(&kcm_family_ops); if (err) goto sock_register_fail; err = kcm_proc_init(); if (err) goto proc_init_fail; return 0; proc_init_fail: sock_unregister(PF_KCM); sock_register_fail: unregister_pernet_device(&kcm_net_ops); net_ops_fail: proto_unregister(&kcm_proto); fail: kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); if (kcm_wq) destroy_workqueue(kcm_wq); return err; } static void __exit kcm_exit(void) { kcm_proc_exit(); sock_unregister(PF_KCM); unregister_pernet_device(&kcm_net_ops); proto_unregister(&kcm_proto); destroy_workqueue(kcm_wq); kmem_cache_destroy(kcm_muxp); kmem_cache_destroy(kcm_psockp); } module_init(kcm_init); module_exit(kcm_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets"); MODULE_ALIAS_NETPROTO(PF_KCM);
1894 839 256 2606 165 2592 12 12 10 19 284 37 2536 83 800 6 17 33 34 828 60 3636 3631 1863 1858 1862 419 1348 3022 31 14 956 957 169 36 939 74 939 67 67 612 460 5 464 1805 21 1972 1587 2532 4 2801 2155 1928 2 3 198 5 1 243 85 874 81 462 883 4 846 3 848 110 54 1 1380 121 110 146 671 72 21 10 79 4 136 4 59 2 7 10 3 7 1 27 2 16 9 5 1395 546 972 1428 229 880 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 1