Total coverage: 100680 (6%)of 1748617
3664 212 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM percpu #if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PERCPU_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(percpu_alloc_percpu, TP_PROTO(unsigned long call_site, bool reserved, bool is_atomic, size_t size, size_t align, void *base_addr, int off, void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off, ptr, bytes_alloc, gfp_flags), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->call_site = call_site; __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s", (void *)__entry->call_site, __entry->reserved, __entry->is_atomic, __entry->size, __entry->align, __entry->base_addr, __entry->off, __entry->ptr, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(percpu_free_percpu, TP_PROTO(void *base_addr, int off, void __percpu *ptr), TP_ARGS(base_addr, off, ptr), TP_STRUCT__entry( __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) ), TP_fast_assign( __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; ), TP_printk("base_addr=%p off=%d ptr=%p", __entry->base_addr, __entry->off, __entry->ptr) ); TRACE_EVENT(percpu_alloc_percpu_fail, TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align), TP_ARGS(reserved, is_atomic, size, align), TP_STRUCT__entry( __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) ), TP_fast_assign( __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; ), TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu", __entry->reserved, __entry->is_atomic, __entry->size, __entry->align) ); TRACE_EVENT(percpu_create_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); TRACE_EVENT(percpu_destroy_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); #endif /* _TRACE_PERCPU_H */ #include <trace/define_trace.h>
4 4 4 4 4 3 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2010 Werner Fink, Jiri Slaby */ #include <linux/console.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/tty_driver.h> /* * This is handler for /proc/consoles */ static int show_console_dev(struct seq_file *m, void *v) { static const struct { short flag; char name; } con_flags[] = { { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, { CON_NBCON, 'N' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, }; char flags[ARRAY_SIZE(con_flags) + 1]; struct console *con = v; unsigned int a; dev_t dev = 0; if (con->device) { const struct tty_driver *driver; int index; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); driver = con->device(con, &index); console_unlock(); if (driver) { dev = MKDEV(driver->major, driver->minor_start); dev += index; } } for (a = 0; a < ARRAY_SIZE(con_flags); a++) flags[a] = (con->flags & con_flags[a].flag) ? con_flags[a].name : ' '; flags[a] = 0; seq_setwidth(m, 21 - 1); seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', ((con->flags & CON_NBCON) || con->write) ? 'W' : '-', con->unblank ? 'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); seq_putc(m, '\n'); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) __acquires(&console_mutex) { struct console *con; loff_t off = 0; /* * Hold the console_list_lock to guarantee safe traversal of the * console list. SRCU cannot be used because there is no * place to store the SRCU cookie. */ console_list_lock(); for_each_console(con) if (off++ == *pos) break; return con; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { struct console *con = v; ++*pos; return hlist_entry_safe(con->node.next, struct console, node); } static void c_stop(struct seq_file *m, void *v) __releases(&console_mutex) { console_list_unlock(); } static const struct seq_operations consoles_op = { .start = c_start, .next = c_next, .stop = c_stop, .show = show_console_dev }; static int __init proc_consoles_init(void) { proc_create_seq("consoles", 0, NULL, &consoles_op); return 0; } fs_initcall(proc_consoles_init);
35 5 33 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 // SPDX-License-Identifier: GPL-2.0-only /* * Lock-less NULL terminated single linked list * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/llist.h> /** * llist_del_first - delete the first entry of lock-less list * @head: the head for your lock-less list * * If list is empty, return NULL, otherwise, return the first entry * deleted, this is the newest added one. * * Only one llist_del_first user can be used simultaneously with * multiple llist_add users without lock. Because otherwise * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, * llist_add) sequence in another user may change @head->first->next, * but keep @head->first. If multiple consumers are needed, please * use llist_del_all or use lock between consumers. */ struct llist_node *llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; entry = smp_load_acquire(&head->first); do { if (entry == NULL) return NULL; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return entry; } EXPORT_SYMBOL_GPL(llist_del_first); /** * llist_del_first_this - delete given entry of lock-less list if it is first * @head: the head for your lock-less list * @this: a list entry. * * If head of the list is given entry, delete and return %true else * return %false. * * Multiple callers can safely call this concurrently with multiple * llist_add() callers, providing all the callers offer a different @this. */ bool llist_del_first_this(struct llist_head *head, struct llist_node *this) { struct llist_node *entry, *next; /* acquire ensures orderig wrt try_cmpxchg() is llist_del_first() */ entry = smp_load_acquire(&head->first); do { if (entry != this) return false; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return true; } EXPORT_SYMBOL_GPL(llist_del_first_this); /** * llist_reverse_order - reverse order of a llist chain * @head: first item of the list to be reversed * * Reverse the order of a chain of llist entries and return the * new first entry. */ struct llist_node *llist_reverse_order(struct llist_node *head) { struct llist_node *new_head = NULL; while (head) { struct llist_node *tmp = head; head = head->next; tmp->next = new_head; new_head = tmp; } return new_head; } EXPORT_SYMBOL_GPL(llist_reverse_order);
6 6 6 6 6 6 63 49 61 5 59 63 61 61 4 6 54 35 22 54 6 6 5 6 6 4 63 6 6 6 6 4 1 6 47 46 47 1 16 37 35 13 19 16 16 38 6 6 6 6 6 6 2 2 2 12 12 12 11 12 3 1 1 1 2 2 1 1 6 6 2 4 2 6 6 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 // SPDX-License-Identifier: GPL-2.0 /* * /proc/sys support */ #include <linux/init.h> #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> #include <linux/kmemleak.h> #include <linux/lockdep.h> #include "internal.h" #define list_for_each_table_entry(entry, header) \ entry = header->ctl_table; \ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* * Support for permanently empty directories. * Must be non-empty to avoid sharing an address with other tables. */ static const struct ctl_table sysctl_mount_point[] = { { } }; /** * register_sysctl_mount_point() - registers a sysctl mount point * @path: path for the mount point * * Used to create a permanently empty directory to serve as mount point. * There are some subtle but important permission checks this allows in the * case of unprivileged mounts. */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { return register_sysctl_sz(path, sysctl_mount_point, 0); } EXPORT_SYMBOL(register_sysctl_mount_point); #define sysctl_is_perm_empty_ctl_header(hptr) \ (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_set_perm_empty_ctl_header(hptr) \ (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_clear_perm_empty_ctl_header(hptr) \ (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT) void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) return; atomic_inc(&poll->event); wake_up_interruptible(&poll->wait); } static const struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { {{.count = 1, .nreg = 1, .ctl_table = root_table }}, .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }, }; static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, const struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); static void sysctl_print_dir(struct ctl_dir *dir) { if (dir->header.parent) sysctl_print_dir(dir->header.parent); pr_cont("%s/", dir->header.ctl_table[0].procname); } static int namecmp(const char *name1, int len1, const char *name2, int len2) { int cmp; cmp = memcmp(name1, name2, min(len1, len2)); if (cmp == 0) cmp = len1 - len2; return cmp; } static const struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; lockdep_assert_held(&sysctl_lock); while (node) { struct ctl_node *ctl_node; const char *procname; int cmp; ctl_node = rb_entry(node, struct ctl_node, node); head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; procname = entry->procname; cmp = namecmp(name, namelen, procname, strlen(procname)); if (cmp < 0) node = node->rb_left; else if (cmp > 0) node = node->rb_right; else { *phead = head; return entry; } } return NULL; } static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; struct rb_node *parent = NULL; const char *name = entry->procname; int namelen = strlen(name); while (*p) { struct ctl_table_header *parent_head; const struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; parent = *p; parent_node = rb_entry(parent, struct ctl_node, node); parent_head = parent_node->header; parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; parent_name = parent_entry->procname; cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); pr_cont("%s\n", entry->procname); return -EEXIST; } } rb_link_node(node, parent, p); rb_insert_color(node, &head->parent->root); return 0; } static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; rb_erase(node, &head->parent->root); } static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, struct ctl_node *node, const struct ctl_table *table, size_t table_size) { head->ctl_table = table; head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; head->nreg = 1; head->unregistering = NULL; head->root = root; head->set = set; head->parent = NULL; head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { const struct ctl_table *entry; list_for_each_table_entry(entry, head) { node->header = head; node++; } } if (table == sysctl_mount_point) sysctl_set_perm_empty_ctl_header(head); } static void erase_header(struct ctl_table_header *head) { const struct ctl_table *entry; list_for_each_table_entry(entry, head) erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { const struct ctl_table *entry; struct ctl_table_header *dir_h = &dir->header; int err; /* Is this a permanently empty directory? */ if (sysctl_is_perm_empty_ctl_header(dir_h)) return -EROFS; /* Am I creating a permanently empty directory? */ if (sysctl_is_perm_empty_ctl_header(header)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; sysctl_set_perm_empty_ctl_header(dir_h); } dir_h->nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; } return 0; fail: erase_header(header); put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) sysctl_clear_perm_empty_ctl_header(dir_h); header->parent = NULL; drop_sysctl_table(dir_h); return err; } static int use_table(struct ctl_table_header *p) { lockdep_assert_held(&sysctl_lock); if (unlikely(p->unregistering)) return 0; p->used++; return 1; } static void unuse_table(struct ctl_table_header *p) { lockdep_assert_held(&sysctl_lock); if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); } static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } static void start_unregistering(struct ctl_table_header *p) { /* will reacquire if has to wait */ lockdep_assert_held(&sysctl_lock); /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock */ if (unlikely(p->used)) { struct completion wait; init_completion(&wait); p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); spin_unlock(&sysctl_lock); } /* * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ spin_lock(&sysctl_lock); erase_header(p); } static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { BUG_ON(!head); spin_lock(&sysctl_lock); if (!use_table(head)) head = ERR_PTR(-ENOENT); spin_unlock(&sysctl_lock); return head; } static void sysctl_head_finish(struct ctl_table_header *head) { if (!head) return; spin_lock(&sysctl_lock); unuse_table(head); spin_unlock(&sysctl_lock); } static struct ctl_table_set * lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) set = root->lookup(root); return set; } static const struct ctl_table *lookup_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); if (entry && use_table(head)) *phead = head; else entry = NULL; spin_unlock(&sysctl_lock); return entry; } static struct ctl_node *first_usable_entry(struct rb_node *node) { struct ctl_node *ctl_node; for (;node; node = rb_next(node)) { ctl_node = rb_entry(node, struct ctl_node, node); if (use_table(ctl_node->header)) return ctl_node; } return NULL; } static void first_entry(struct ctl_dir *dir, struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = NULL; const struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); ctl_node = first_usable_entry(rb_first(&dir->root)); spin_unlock(&sysctl_lock); if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = *phead; const struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); unuse_table(head); ctl_node = first_usable_entry(rb_next(&ctl_node->node)); spin_unlock(&sysctl_lock); head = NULL; if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. */ static int test_perm(int mode, int op) { if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; if (root->permissions) mode = root->permissions(head, table); else mode = table->mode; return test_perm(mode, op); } static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, const struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); ei = PROC_I(inode); spin_lock(&sysctl_lock); if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; if (sysctl_is_perm_empty_ctl_header(head)) make_empty_dir_inode(inode); } inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; if (root->set_ownership) root->set_ownership(head, &inode->i_uid, &inode->i_gid); return inode; } void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); } static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; if (!head) head = &sysctl_table_root.default_set.dir.header; return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; const struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; int ret; if (IS_ERR(head)) return ERR_CAST(head); ctl_dir = container_of(head, struct ctl_dir, header); p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; if (S_ISLNK(p->mode)) { ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; } inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations); out: if (h) sysctl_head_finish(h); sysctl_head_finish(head); return err; } static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, int write) { struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; size_t count = iov_iter_count(iter); char *kbuf; ssize_t error; if (IS_ERR(head)) return PTR_ERR(head); /* * At this point we know that the sysctl was not unregistered * and won't be until we finish. */ error = -EPERM; if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ error = -EINVAL; if (!table->proc_handler) goto out; /* don't even try if the size is too large */ error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; if (write) { error = -EFAULT; if (!copy_from_iter_full(kbuf, count, iter)) goto out_free_buf; kbuf[count] = '\0'; } error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; /* careful: calling conventions are nasty here */ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; if (!write) { error = -EFAULT; if (copy_to_iter(kbuf, count, iter) < count) goto out_free_buf; } error = count; out_free_buf: kvfree(kbuf); out: sysctl_head_finish(head); return error; } static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 0); } static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 1); } static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) return PTR_ERR(head); if (table->poll) filp->private_data = proc_sys_poll_event(table->poll); sysctl_head_finish(head); return 0; } static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; __poll_t ret = DEFAULT_POLLMASK; unsigned long event; /* sysctl was unregistered */ if (IS_ERR(head)) return EPOLLERR | EPOLLHUP; if (!table->proc_handler) goto out; if (!table->poll) goto out; event = (unsigned long)filp->private_data; poll_wait(filp, &table->poll->wait, wait); if (event != atomic_read(&table->poll->event)) { filp->private_data = proc_sys_poll_event(table->poll); ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } out: sysctl_head_finish(head); return ret; } static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, const struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; unsigned type = DT_UNKNOWN; qname.name = table->procname; qname.len = strlen(table->procname); qname.hash = full_name_hash(dir, qname.name, qname.len); child = d_lookup(dir, &qname); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); res = d_splice_alias_ops(inode, child, &proc_sys_dentry_operations); d_lookup_done(child); if (unlikely(res)) { dput(child); if (IS_ERR(res)) return false; child = res; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); return dir_emit(ctx, qname.name, qname.len, ino, type); } static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, const struct ctl_table *table) { bool ret = true; head = sysctl_head_grab(head); if (IS_ERR(head)) return false; /* It is not an error if we can not follow the link ignore it */ if (sysctl_follow_link(&head, &table)) goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: sysctl_head_finish(head); return ret; } static int scan(struct ctl_table_header *head, const struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { bool res; if ((*pos)++ < ctx->pos) return true; if (unlikely(S_ISLNK(table->mode))) res = proc_sys_link_fill_cache(file, ctx, head, table); else res = proc_sys_fill_cache(file, ctx, head, table); if (res) ctx->pos = *pos; return res; } static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; const struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; if (IS_ERR(head)) return PTR_ERR(head); ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) goto out; pos = 2; for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } out: sysctl_head_finish(head); return 0; } static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ struct ctl_table_header *head; const struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; head = grab_header(inode); if (IS_ERR(head)) return PTR_ERR(head); table = PROC_I(inode)->sysctl_entry; if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; } static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; sysctl_head_finish(head); return 0; } static const struct file_operations proc_sys_file_operations = { .open = proc_sys_open, .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, .iterate_shared = proc_sys_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_sys_inode_operations = { .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static const struct inode_operations proc_sys_dir_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) { struct ctl_table_set *set = p->set; int res; spin_lock(&sysctl_lock); if (p->unregistering) res = 0; else if (!set->is_seen) res = 1; else res = set->is_seen(set); spin_unlock(&sysctl_lock); return res; } static int proc_sys_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; struct inode *inode; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; // false positive is fine here - we'll recheck anyway if (d_in_lookup(dentry)) return 0; inode = d_inode_rcu(dentry); // we just might have run into dentry in the middle of __dentry_kill() if (!inode) return 1; head = READ_ONCE(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { .d_revalidate = proc_sys_revalidate, .d_delete = proc_sys_delete, .d_compare = proc_sys_compare, }; static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) return ERR_PTR(-ENOENT); if (!S_ISDIR(entry->mode)) return ERR_PTR(-ENOTDIR); return container_of(head, struct ctl_dir, header); } static struct ctl_dir *new_dir(struct ctl_table_set *set, const char *name, int namelen) { struct ctl_table *table; struct ctl_dir *new; struct ctl_node *node; char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + sizeof(struct ctl_table) + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 1); memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } /** * get_subdir - find or create a subdir with the specified name. * @dir: Directory to create the subdirectory in * @name: The name of the subdirectory to find or create * @namelen: The length of name * * Takes a directory with an elevated reference count so we know that * if we drop the lock the directory will not go away. Upon success * the reference is moved from @dir to the returned subdirectory. * Upon error an error code is returned and the reference on @dir is * simply dropped. */ static struct ctl_dir *get_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_set *set = dir->header.set; struct ctl_dir *subdir, *new = NULL; int err; spin_lock(&sysctl_lock); subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; spin_unlock(&sysctl_lock); new = new_dir(set, name, namelen); spin_lock(&sysctl_lock); subdir = ERR_PTR(-ENOMEM); if (!new) goto failed; /* Was the subdir added while we dropped the lock? */ subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; /* Nope. Use the our freshly made directory entry. */ err = insert_header(dir, &new->header); subdir = ERR_PTR(err); if (err) goto failed; subdir = new; found: subdir->header.nreg++; failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); pr_cont("%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) drop_sysctl_table(&new->header); spin_unlock(&sysctl_lock); return subdir; } static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) { struct ctl_dir *parent; const char *procname; if (!dir->header.parent) return &set->dir; parent = xlate_dir(set, dir->header.parent); if (IS_ERR(parent)) return parent; procname = dir->header.ctl_table[0].procname; return find_subdir(parent, procname, strlen(procname)); } static int sysctl_follow_link(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head; const struct ctl_table *entry; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_dir *dir; int ret; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); else { const char *procname = (*pentry)->procname; head = NULL; entry = find_entry(&head, dir, procname, strlen(procname)); ret = -ENOENT; if (entry && use_table(head)) { unuse_table(*phead); *phead = head; *pentry = entry; ret = 0; } } spin_unlock(&sysctl_lock); return ret; } static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("sysctl table check failed: %s/%s %pV\n", path, table->procname, &vaf); va_end(args); return -EINVAL; } static int sysctl_check_table_array(const char *path, const struct ctl_table *table) { unsigned int extra; int err = 0; if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array not allowed"); } if (table->proc_handler == proc_dou8vec_minmax) { if (table->maxlen != sizeof(u8)) err |= sysctl_err(path, table, "array not allowed"); if (table->extra1) { extra = *(unsigned int *) table->extra1; if (extra > 255U) err |= sysctl_err(path, table, "range value too large for proc_dou8vec_minmax"); } if (table->extra2) { extra = *(unsigned int *) table->extra2; if (extra > 255U) err |= sysctl_err(path, table, "range value too large for proc_dou8vec_minmax"); } } if (table->proc_handler == proc_dobool) { if (table->maxlen != sizeof(bool)) err |= sysctl_err(path, table, "array not allowed"); } return err; } static int sysctl_check_table(const char *path, struct ctl_table_header *header) { const struct ctl_table *entry; int err = 0; list_for_each_table_entry(entry, header) { if (!entry->procname) err |= sysctl_err(path, entry, "procname is null"); if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || (entry->proc_handler == proc_douintvec) || (entry->proc_handler == proc_douintvec_minmax) || (entry->proc_handler == proc_dointvec_minmax) || (entry->proc_handler == proc_dou8vec_minmax) || (entry->proc_handler == proc_dointvec_jiffies) || (entry->proc_handler == proc_dointvec_userhz_jiffies) || (entry->proc_handler == proc_dointvec_ms_jiffies) || (entry->proc_handler == proc_doulongvec_minmax) || (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { if (!entry->data) err |= sysctl_err(path, entry, "No data"); if (!entry->maxlen) err |= sysctl_err(path, entry, "No maxlen"); else err |= sysctl_check_table_array(path, entry); } if (!entry->proc_handler) err |= sysctl_err(path, entry, "No proc_handler"); if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) err |= sysctl_err(path, entry, "bogus .mode 0%o", entry->mode); } return err; } static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { struct ctl_table *link_table, *link; struct ctl_table_header *links; const struct ctl_table *entry; struct ctl_node *node; char *link_name; int name_bytes; name_bytes = 0; list_for_each_table_entry(entry, head) { name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*head->ctl_table_size + sizeof(struct ctl_table)*head->ctl_table_size + name_bytes, GFP_KERNEL); if (!links) return NULL; node = (struct ctl_node *)(links + 1); link_table = (struct ctl_table *)(node + head->ctl_table_size); link_name = (char *)(link_table + head->ctl_table_size); link = link_table; list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; link->data = head->root; link_name += len; link++; } init_header(links, dir->header.root, dir->header.set, node, link_table, head->ctl_table_size); links->nreg = head->ctl_table_size; return links; } static bool get_links(struct ctl_dir *dir, struct ctl_table_header *header, struct ctl_table_root *link_root) { struct ctl_table_header *tmp_head; const struct ctl_table *entry, *link; if (header->ctl_table_size == 0 || sysctl_is_perm_empty_ctl_header(header)) return true; /* Are there links available for every entry in table? */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) continue; if (S_ISLNK(link->mode) && (link->data == link_root)) continue; return false; } /* The checks passed. Increase the registration count on the links */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); tmp_head->nreg++; } return true; } static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_dir *core_parent; struct ctl_table_header *links; int err; if (head->set == root_set) return 0; core_parent = xlate_dir(root_set, head->parent); if (IS_ERR(core_parent)) return 0; if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; if (!links) goto out; err = 0; if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } err = insert_header(core_parent, links); if (err) kfree(links); out: drop_sysctl_table(&core_parent->header); return err; } /* Find the directory for the ctl_table. If one is not found create it. */ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) { const char *name, *nextname; for (name = path; name; name = nextname) { int namelen; nextname = strchr(name, '/'); if (nextname) { namelen = nextname - name; nextname++; } else { namelen = strlen(name); } if (namelen == 0) continue; /* * namelen ensures if name is "foo/bar/yay" only foo is * registered first. We traverse as if using mkdir -p and * return a ctl_dir for the last directory entry. */ dir = get_subdir(dir, name, namelen); if (IS_ERR(dir)) break; } return dir; } /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * * @table: the top-level table structure. This table should not be free'd * after registration. So it should not be used on stack. It can either * be a global or dynamically allocated by the caller and free'd later * after sysctl unregistration. * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. * * The members of the &struct ctl_table structure are used as follows: * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * data - a pointer to data for use by proc_handler * maxlen - the maximum size in bytes of the data * mode - the file permissions for the /proc/sys file * type - Defines the target type (described in struct definition) * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines * XXX: we should eventually modify these to use long min / max [0] * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes are not allowed. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - * * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() * * It is the handler's job to read the input buffer from user memory * and process it. The handler should return 0 on success. * * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, const struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; struct ctl_dir *dir; struct ctl_node *node; header = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); init_header(header, root, set, node, table, table_size); if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); dir = &set->dir; /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); dir = sysctl_mkdir_p(dir, path); if (IS_ERR(dir)) goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); return header; fail_put_dir_locked: drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: kfree(header); return NULL; } /** * register_sysctl_sz - register a sysctl table * @path: The path to the directory the sysctl table is in. If the path * doesn't exist we will create it for you. * @table: the table structure. The calller must ensure the life of the @table * will be kept during the lifetime use of the syctl. It must not be freed * until unregister_sysctl_table() is called with the given returned table * with this registration. If your code is non modular then you don't need * to call unregister_sysctl_table() and can instead use something like * register_sysctl_init() which does not care for the result of the syctl * registration. * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, path, table, table_size); } EXPORT_SYMBOL(register_sysctl_sz); /** * __register_sysctl_init() - register sysctl table to path * @path: path name for sysctl base. If that path doesn't exist we will create * it for you. * @table: This is the sysctl table that needs to be registered to the path. * The caller must ensure the life of the @table will be kept during the * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails * @table_size: The number of elements in table * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default * values pre-set. Code which depends on these variables will always work even * if register_sysctl() fails. If register_sysctl() fails you'd just loose the * ability to query or modify the sysctls dynamically at run time. Chances of * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, const struct ctl_table *table, const char *table_name, size_t table_size) { struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); if (unlikely(!hdr)) { pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); } static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; const struct ctl_table *entry; if (header->set == root_set) return; core_parent = xlate_dir(root_set, parent); if (IS_ERR(core_parent)) return; list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; const struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); if (link && ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || (S_ISLNK(link->mode) && (link->data == root)))) { drop_sysctl_table(link_head); } else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); pr_cont("%s\n", name); } } } static void drop_sysctl_table(struct ctl_table_header *header) { struct ctl_dir *parent = header->parent; if (--header->nreg) return; if (parent) { put_links(header); start_unregistering(header); } if (!--header->count) kfree_rcu(header, rcu); if (parent) drop_sysctl_table(&parent->header); } /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl or __register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone. */ void unregister_sysctl_table(struct ctl_table_header * header) { might_sleep(); if (header == NULL) return; spin_lock(&sysctl_lock); drop_sysctl_table(header); spin_unlock(&sysctl_lock); } EXPORT_SYMBOL(unregister_sysctl_table); void setup_sysctl_set(struct ctl_table_set *set, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; init_header(&set->dir.header, root, set, NULL, root_table, 1); } void retire_sysctl_set(struct ctl_table_set *set) { WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); } int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return sysctl_init_bases(); } struct sysctl_alias { const char *kernel_param; const char *sysctl_param; }; /* * Historically some settings had both sysctl and a command line parameter. * With the generic sysctl. parameter support, we can handle them at a single * place and only keep the historical name for compatibility. This is not meant * to add brand new aliases. When adding existing aliases, consider whether * the possibly different moment of changing the value (e.g. from early_param * to the moment do_sysctl_args() is called) is an issue for the specific * parameter. */ static const struct sysctl_alias sysctl_aliases[] = { {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, {"hung_task_panic", "kernel.hung_task_panic" }, {"numa_zonelist_order", "vm.numa_zonelist_order" }, {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, { } }; static const char *sysctl_find_alias(char *param) { const struct sysctl_alias *alias; for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { if (strcmp(alias->kernel_param, param) == 0) return alias->sysctl_param; } return NULL; } bool sysctl_is_alias(char *param) { const char *alias = sysctl_find_alias(param); return alias != NULL; } /* Set sysctl value passed on kernel command line. */ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) { char *path; struct vfsmount **proc_mnt = arg; struct file_system_type *proc_fs_type; struct file *file; int len; int err; loff_t pos = 0; ssize_t wret; if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; if (param[0] != '/' && param[0] != '.') return 0; param++; } else { param = (char *) sysctl_find_alias(param); if (!param) return 0; } if (!val) return -EINVAL; len = strlen(val); if (len == 0) return -EINVAL; /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no * options were given, we mount it only when the first sysctl option is * found. Why not a persistent mount? There are problems with a * persistent mount of proc in that it forces userspace not to use any * proc mount options. */ if (!*proc_mnt) { proc_fs_type = get_fs_type("proc"); if (!proc_fs_type) { pr_err("Failed to find procfs to set sysctl from command line\n"); return 0; } *proc_mnt = kern_mount(proc_fs_type); put_filesystem(proc_fs_type); if (IS_ERR(*proc_mnt)) { pr_err("Failed to mount procfs to set sysctl from command line\n"); return 0; } } path = kasprintf(GFP_KERNEL, "sys/%s", param); if (!path) panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", param, val); else if (err == -EACCES) pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", param, val); else pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", file, param, val); goto out; } wret = kernel_write(file, val, len, &pos); if (wret < 0) { err = wret; if (err == -EINVAL) pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", param, val); else pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", ERR_PTR(err), param, val); } else if (wret != len) { pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", wret, len, path, param, val); } err = filp_close(file, NULL); if (err) pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", ERR_PTR(err), param, val); out: kfree(path); return 0; } void do_sysctl_args(void) { char *command_line; struct vfsmount *proc_mnt = NULL; command_line = kstrdup(saved_command_line, GFP_KERNEL); if (!command_line) panic("%s: Failed to allocate copy of command line\n", __func__); parse_args("Setting sysctl args", command_line, NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); if (proc_mnt) kern_unmount(proc_mnt); kfree(command_line); }
4 32 32 32 32 559 560 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); wakeup_attr(relax_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_relax_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. * @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init);
1 169 169 169 55 182 182 15 1 1 8 171 182 31 1 1 31 181 181 122 75 17 59 5 169 170 3 90 86 8 8 142 142 142 156 160 121 3 169 141 131 20 121 56 54 142 5 135 121 135 23 23 23 23 22 23 19 5 5 5 22 23 23 23 41 1 41 7 152 153 153 153 30 100 149 1 9 10 142 136 124 10 135 130 13 130 13 135 122 23 135 21 152 136 82 103 134 34 5 2 134 5 49 94 41 41 39 89 81 63 42 19 9 9 9 9 26 5 12 12 17 17 12 17 6 3 2 2 4 3 2 3 2 2 4 34 34 101 1 41 67 101 1 13 13 13 13 13 13 13 13 13 13 13 13 13 13 4 13 2 2 2 2 1 3 3 45 45 7 34 1 34 18 29 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2005 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ /* * jfs_txnmgr.c: transaction manager * * notes: * transaction starts with txBegin() and ends with txCommit() * or txAbort(). * * tlock is acquired at the time of update; * (obviate scan at commit time for xtree and dtree) * tlock and mp points to each other; * (no hashlist for mp -> tlock). * * special cases: * tlock on in-memory inode: * in-place tlock in the in-memory inode itself; * converted to page lock by iWrite() at commit time. * * tlock during write()/mmap() under anonymous transaction (tid = 0): * transferred (?) to transaction at commit time. * * use the page itself to update allocation maps * (obviate intermediate replication of allocation/deallocation data) * hold on to mp+lock thru update of maps */ #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/completion.h> #include <linux/freezer.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/seq_file.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dinode.h" #include "jfs_imap.h" #include "jfs_dmap.h" #include "jfs_superblock.h" #include "jfs_debug.h" /* * transaction management structures */ static struct { int freetid; /* index of a free tid structure */ int freelock; /* index first free lock word */ wait_queue_head_t freewait; /* eventlist of free tblock */ wait_queue_head_t freelockwait; /* eventlist of free tlock */ wait_queue_head_t lowlockwait; /* eventlist of ample tlocks */ int tlocksInUse; /* Number of tlocks in use */ spinlock_t LazyLock; /* synchronize sync_queue & unlock_queue */ /* struct tblock *sync_queue; * Transactions waiting for data sync */ struct list_head unlock_queue; /* Txns waiting to be released */ struct list_head anon_list; /* inodes having anonymous txns */ struct list_head anon_list2; /* inodes having anonymous txns that couldn't be sync'ed */ } TxAnchor; int jfs_tlocks_low; /* Indicates low number of available tlocks */ #ifdef CONFIG_JFS_STATISTICS static struct { uint txBegin; uint txBegin_barrier; uint txBegin_lockslow; uint txBegin_freetid; uint txBeginAnon; uint txBeginAnon_barrier; uint txBeginAnon_lockslow; uint txLockAlloc; uint txLockAlloc_freelock; } TxStat; #endif static int nTxBlock = -1; /* number of transaction blocks */ module_param(nTxBlock, int, 0); MODULE_PARM_DESC(nTxBlock, "Number of transaction blocks (max:65536)"); static int nTxLock = -1; /* number of transaction locks */ module_param(nTxLock, int, 0); MODULE_PARM_DESC(nTxLock, "Number of transaction locks (max:65536)"); struct tblock *TxBlock; /* transaction block table */ static int TxLockLWM; /* Low water mark for number of txLocks used */ static int TxLockHWM; /* High water mark for number of txLocks used */ static int TxLockVHWM; /* Very High water mark */ struct tlock *TxLock; /* transaction lock table */ /* * transaction management lock */ static DEFINE_SPINLOCK(jfsTxnLock); #define TXN_LOCK() spin_lock(&jfsTxnLock) #define TXN_UNLOCK() spin_unlock(&jfsTxnLock) #define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock) #define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) #define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags) static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait); static int jfs_commit_thread_waking; /* * Retry logic exist outside these macros to protect from spurrious wakeups. */ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(event, &wait); set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); io_schedule(); remove_wait_queue(event, &wait); } #define TXN_SLEEP(event)\ {\ TXN_SLEEP_DROP_LOCK(event);\ TXN_LOCK();\ } #define TXN_WAKEUP(event) wake_up_all(event) /* * statistics */ static struct { tid_t maxtid; /* 4: biggest tid ever used */ lid_t maxlid; /* 4: biggest lid ever used */ int ntid; /* 4: # of transactions performed */ int nlid; /* 4: # of tlocks acquired */ int waitlock; /* 4: # of tlock wait */ } stattx; /* * forward references */ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck, struct commit *cd); static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck); static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk); static void txForce(struct tblock * tblk); static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd); static void txUpdateMap(struct tblock * tblk); static void txRelease(struct tblock * tblk); static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void LogSyncRelease(struct metapage * mp); /* * transaction block/lock management * --------------------------------- */ /* * Get a transaction lock from the free list. If the number in use is * greater than the high water mark, wake up the sync daemon. This should * free some anonymous transaction locks. (TXN_LOCK must be held.) */ static lid_t txLockAlloc(void) { lid_t lid; INCREMENT(TxStat.txLockAlloc); if (!TxAnchor.freelock) { INCREMENT(TxStat.txLockAlloc_freelock); } while (!(lid = TxAnchor.freelock)) TXN_SLEEP(&TxAnchor.freelockwait); TxAnchor.freelock = TxLock[lid].next; HIGHWATERMARK(stattx.maxlid, lid); if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) { jfs_info("txLockAlloc tlocks low"); jfs_tlocks_low = 1; wake_up_process(jfsSyncThread); } return lid; } static void txLockFree(lid_t lid) { TxLock[lid].tid = 0; TxLock[lid].next = TxAnchor.freelock; TxAnchor.freelock = lid; TxAnchor.tlocksInUse--; if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) { jfs_info("txLockFree jfs_tlocks_low no more"); jfs_tlocks_low = 0; TXN_WAKEUP(&TxAnchor.lowlockwait); } TXN_WAKEUP(&TxAnchor.freelockwait); } /* * NAME: txInit() * * FUNCTION: initialize transaction management structures * * RETURN: * * serialization: single thread at jfs_init() */ int txInit(void) { int k, size; struct sysinfo si; /* Set defaults for nTxLock and nTxBlock if unset */ if (nTxLock == -1) { if (nTxBlock == -1) { /* Base default on memory size */ si_meminfo(&si); if (si.totalram > (256 * 1024)) /* 1 GB */ nTxLock = 64 * 1024; else nTxLock = si.totalram >> 2; } else if (nTxBlock > (8 * 1024)) nTxLock = 64 * 1024; else nTxLock = nTxBlock << 3; } if (nTxBlock == -1) nTxBlock = nTxLock >> 3; /* Verify tunable parameters */ if (nTxBlock < 16) nTxBlock = 16; /* No one should set it this low */ if (nTxBlock > 65536) nTxBlock = 65536; if (nTxLock < 256) nTxLock = 256; /* No one should set it this low */ if (nTxLock > 65536) nTxLock = 65536; printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n", nTxBlock, nTxLock); /* * initialize transaction block (tblock) table * * transaction id (tid) = tblock index * tid = 0 is reserved. */ TxLockLWM = (nTxLock * 4) / 10; TxLockHWM = (nTxLock * 7) / 10; TxLockVHWM = (nTxLock * 8) / 10; size = sizeof(struct tblock) * nTxBlock; TxBlock = vmalloc(size); if (TxBlock == NULL) return -ENOMEM; for (k = 0; k < nTxBlock; k++) { init_waitqueue_head(&TxBlock[k].gcwait); init_waitqueue_head(&TxBlock[k].waitor); } for (k = 1; k < nTxBlock - 1; k++) { TxBlock[k].next = k + 1; } TxBlock[k].next = 0; TxAnchor.freetid = 1; init_waitqueue_head(&TxAnchor.freewait); stattx.maxtid = 1; /* statistics */ /* * initialize transaction lock (tlock) table * * transaction lock id = tlock index * tlock id = 0 is reserved. */ size = sizeof(struct tlock) * nTxLock; TxLock = vmalloc(size); if (TxLock == NULL) { vfree(TxBlock); return -ENOMEM; } /* initialize tlock table */ for (k = 1; k < nTxLock - 1; k++) TxLock[k].next = k + 1; TxLock[k].next = 0; init_waitqueue_head(&TxAnchor.freelockwait); init_waitqueue_head(&TxAnchor.lowlockwait); TxAnchor.freelock = 1; TxAnchor.tlocksInUse = 0; INIT_LIST_HEAD(&TxAnchor.anon_list); INIT_LIST_HEAD(&TxAnchor.anon_list2); LAZY_LOCK_INIT(); INIT_LIST_HEAD(&TxAnchor.unlock_queue); stattx.maxlid = 1; /* statistics */ return 0; } /* * NAME: txExit() * * FUNCTION: clean up when module is unloaded */ void txExit(void) { vfree(TxLock); TxLock = NULL; vfree(TxBlock); TxBlock = NULL; } /* * NAME: txBegin() * * FUNCTION: start a transaction. * * PARAMETER: sb - superblock * flag - force for nested tx; * * RETURN: tid - transaction id * * note: flag force allows to start tx for nested tx * to prevent deadlock on logsync barrier; */ tid_t txBegin(struct super_block *sb, int flag) { tid_t t; struct tblock *tblk; struct jfs_log *log; jfs_info("txBegin: flag = 0x%x", flag); log = JFS_SBI(sb)->log; if (!log) { jfs_error(sb, "read-only filesystem\n"); return 0; } TXN_LOCK(); INCREMENT(TxStat.txBegin); retry: if (!(flag & COMMIT_FORCE)) { /* * synchronize with logsync barrier */ if (test_bit(log_SYNCBARRIER, &log->flag) || test_bit(log_QUIESCE, &log->flag)) { INCREMENT(TxStat.txBegin_barrier); TXN_SLEEP(&log->syncwait); goto retry; } } if (flag == 0) { /* * Don't begin transaction if we're getting starved for tlocks * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately * free tlocks) */ if (TxAnchor.tlocksInUse > TxLockVHWM) { INCREMENT(TxStat.txBegin_lockslow); TXN_SLEEP(&TxAnchor.lowlockwait); goto retry; } } /* * allocate transaction id/block */ if ((t = TxAnchor.freetid) == 0) { jfs_info("txBegin: waiting for free tid"); INCREMENT(TxStat.txBegin_freetid); TXN_SLEEP(&TxAnchor.freewait); goto retry; } tblk = tid_to_tblock(t); if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) { /* Don't let a non-forced transaction take the last tblk */ jfs_info("txBegin: waiting for free tid"); INCREMENT(TxStat.txBegin_freetid); TXN_SLEEP(&TxAnchor.freewait); goto retry; } TxAnchor.freetid = tblk->next; /* * initialize transaction */ /* * We can't zero the whole thing or we screw up another thread being * awakened after sleeping on tblk->waitor * * memset(tblk, 0, sizeof(struct tblock)); */ tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; tblk->sb = sb; ++log->logtid; tblk->logtid = log->logtid; ++log->active; HIGHWATERMARK(stattx.maxtid, t); /* statistics */ INCREMENT(stattx.ntid); /* statistics */ TXN_UNLOCK(); jfs_info("txBegin: returning tid = %d", t); return t; } /* * NAME: txBeginAnon() * * FUNCTION: start an anonymous transaction. * Blocks if logsync or available tlocks are low to prevent * anonymous tlocks from depleting supply. * * PARAMETER: sb - superblock * * RETURN: none */ void txBeginAnon(struct super_block *sb) { struct jfs_log *log; log = JFS_SBI(sb)->log; TXN_LOCK(); INCREMENT(TxStat.txBeginAnon); retry: /* * synchronize with logsync barrier */ if (test_bit(log_SYNCBARRIER, &log->flag) || test_bit(log_QUIESCE, &log->flag)) { INCREMENT(TxStat.txBeginAnon_barrier); TXN_SLEEP(&log->syncwait); goto retry; } /* * Don't begin transaction if we're getting starved for tlocks */ if (TxAnchor.tlocksInUse > TxLockVHWM) { INCREMENT(TxStat.txBeginAnon_lockslow); TXN_SLEEP(&TxAnchor.lowlockwait); goto retry; } TXN_UNLOCK(); } /* * txEnd() * * function: free specified transaction block. * * logsync barrier processing: * * serialization: */ void txEnd(tid_t tid) { struct tblock *tblk = tid_to_tblock(tid); struct jfs_log *log; jfs_info("txEnd: tid = %d", tid); TXN_LOCK(); /* * wakeup transactions waiting on the page locked * by the current transaction */ TXN_WAKEUP(&tblk->waitor); log = JFS_SBI(tblk->sb)->log; /* * Lazy commit thread can't free this guy until we mark it UNLOCKED, * otherwise, we would be left with a transaction that may have been * reused. * * Lazy commit thread will turn off tblkGC_LAZY before calling this * routine. */ if (tblk->flag & tblkGC_LAZY) { jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk); TXN_UNLOCK(); spin_lock_irq(&log->gclock); // LOGGC_LOCK tblk->flag |= tblkGC_UNLOCKED; spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK return; } jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk); assert(tblk->next == 0); /* * insert tblock back on freelist */ tblk->next = TxAnchor.freetid; TxAnchor.freetid = tid; /* * mark the tblock not active */ if (--log->active == 0) { clear_bit(log_FLUSH, &log->flag); /* * synchronize with logsync barrier */ if (test_bit(log_SYNCBARRIER, &log->flag)) { TXN_UNLOCK(); /* write dirty metadata & forward log syncpt */ jfs_syncpt(log, 1); jfs_info("log barrier off: 0x%x", log->lsn); /* enable new transactions start */ clear_bit(log_SYNCBARRIER, &log->flag); /* wakeup all waitors for logsync barrier */ TXN_WAKEUP(&log->syncwait); goto wakeup; } } TXN_UNLOCK(); wakeup: /* * wakeup all waitors for a free tblock */ TXN_WAKEUP(&TxAnchor.freewait); } /* * txLock() * * function: acquire a transaction lock on the specified <mp> * * parameter: * * return: transaction lock id * * serialization: */ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, int type) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); int dir_xtree = 0; lid_t lid; tid_t xtid; struct tlock *tlck; struct xtlock *xtlck; struct linelock *linelock; xtpage_t *p; struct tblock *tblk; TXN_LOCK(); if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && !(mp->xflag & COMMIT_PAGE)) { /* * Directory inode is special. It can have both an xtree tlock * and a dtree tlock associated with it. */ dir_xtree = 1; lid = jfs_ip->xtlid; } else lid = mp->lid; /* is page not locked by a transaction ? */ if (lid == 0) goto allocateLock; jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid); /* is page locked by the requester transaction ? */ tlck = lid_to_tlock(lid); if ((xtid = tlck->tid) == tid) { TXN_UNLOCK(); goto grantLock; } /* * is page locked by anonymous transaction/lock ? * * (page update without transaction (i.e., file write) is * locked under anonymous transaction tid = 0: * anonymous tlocks maintained on anonymous tlock list of * the inode of the page and available to all anonymous * transactions until txCommit() time at which point * they are transferred to the transaction tlock list of * the committing transaction of the inode) */ if (xtid == 0) { tlck->tid = tid; TXN_UNLOCK(); tblk = tid_to_tblock(tid); /* * The order of the tlocks in the transaction is important * (during truncate, child xtree pages must be freed before * parent's tlocks change the working map). * Take tlock off anonymous list and add to tail of * transaction list * * Note: We really need to get rid of the tid & lid and * use list_head's. This code is getting UGLY! */ if (jfs_ip->atlhead == lid) { if (jfs_ip->atltail == lid) { /* only anonymous txn. * Remove from anon_list */ TXN_LOCK(); list_del_init(&jfs_ip->anon_inode_list); TXN_UNLOCK(); } jfs_ip->atlhead = tlck->next; } else { lid_t last; for (last = jfs_ip->atlhead; lid_to_tlock(last)->next != lid; last = lid_to_tlock(last)->next) { assert(last); } lid_to_tlock(last)->next = tlck->next; if (jfs_ip->atltail == lid) jfs_ip->atltail = last; } /* insert the tlock at tail of transaction tlock list */ if (tblk->next) lid_to_tlock(tblk->last)->next = lid; else tblk->next = lid; tlck->next = 0; tblk->last = lid; goto grantLock; } goto waitLock; /* * allocate a tlock */ allocateLock: lid = txLockAlloc(); tlck = lid_to_tlock(lid); /* * initialize tlock */ tlck->tid = tid; TXN_UNLOCK(); /* mark tlock for meta-data page */ if (mp->xflag & COMMIT_PAGE) { tlck->flag = tlckPAGELOCK; /* mark the page dirty and nohomeok */ metapage_nohomeok(mp); jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p", mp, mp->nohomeok, tid, tlck); /* if anonymous transaction, and buffer is on the group * commit synclist, mark inode to show this. This will * prevent the buffer from being marked nohomeok for too * long a time. */ if ((tid == 0) && mp->lsn) set_cflag(COMMIT_Synclist, ip); } /* mark tlock for in-memory inode */ else tlck->flag = tlckINODELOCK; if (S_ISDIR(ip->i_mode)) tlck->flag |= tlckDIRECTORY; tlck->type = 0; /* bind the tlock and the page */ tlck->ip = ip; tlck->mp = mp; if (dir_xtree) jfs_ip->xtlid = lid; else mp->lid = lid; /* * enqueue transaction lock to transaction/inode */ /* insert the tlock at tail of transaction tlock list */ if (tid) { tblk = tid_to_tblock(tid); if (tblk->next) lid_to_tlock(tblk->last)->next = lid; else tblk->next = lid; tlck->next = 0; tblk->last = lid; } /* anonymous transaction: * insert the tlock at head of inode anonymous tlock list */ else { tlck->next = jfs_ip->atlhead; jfs_ip->atlhead = lid; if (tlck->next == 0) { /* This inode's first anonymous transaction */ jfs_ip->atltail = lid; TXN_LOCK(); list_add_tail(&jfs_ip->anon_inode_list, &TxAnchor.anon_list); TXN_UNLOCK(); } } /* initialize type dependent area for linelock */ linelock = (struct linelock *) & tlck->lock; linelock->next = 0; linelock->flag = tlckLINELOCK; linelock->maxcnt = TLOCKSHORT; linelock->index = 0; switch (type & tlckTYPE) { case tlckDTREE: linelock->l2linesize = L2DTSLOTSIZE; break; case tlckXTREE: linelock->l2linesize = L2XTSLOTSIZE; xtlck = (struct xtlock *) linelock; xtlck->header.offset = 0; xtlck->header.length = 2; if (type & tlckNEW) { xtlck->lwm.offset = XTENTRYSTART; } else { if (mp->xflag & COMMIT_PAGE) p = (xtpage_t *) mp->data; else p = (xtpage_t *) &jfs_ip->i_xtroot; xtlck->lwm.offset = le16_to_cpu(p->header.nextindex); } xtlck->lwm.length = 0; /* ! */ xtlck->twm.offset = 0; xtlck->hwm.offset = 0; xtlck->index = 2; break; case tlckINODE: linelock->l2linesize = L2INODESLOTSIZE; break; case tlckDATA: linelock->l2linesize = L2DATASLOTSIZE; break; default: jfs_err("UFO tlock:0x%p", tlck); } /* * update tlock vector */ grantLock: tlck->type |= type; return tlck; /* * page is being locked by another transaction: */ waitLock: /* Only locks on ipimap or ipaimap should reach here */ /* assert(jfs_ip->fileset == AGGREGATE_I); */ if (jfs_ip->fileset != AGGREGATE_I) { printk(KERN_ERR "txLock: trying to lock locked page!"); print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, ip, sizeof(*ip), 0); print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, mp, sizeof(*mp), 0); print_hex_dump(KERN_ERR, "Locker's tblock: ", DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), sizeof(struct tblock), 0); print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, tlck, sizeof(*tlck), 0); BUG(); } INCREMENT(stattx.waitlock); /* statistics */ TXN_UNLOCK(); release_metapage(mp); TXN_LOCK(); xtid = tlck->tid; /* reacquire after dropping TXN_LOCK */ jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d", tid, xtid, lid); /* Recheck everything since dropping TXN_LOCK */ if (xtid && (tlck->mp == mp) && (mp->lid == lid)) TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); else TXN_UNLOCK(); jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid); return NULL; } /* * NAME: txRelease() * * FUNCTION: Release buffers associated with transaction locks, but don't * mark homeok yet. The allows other transactions to modify * buffers, but won't let them go to disk until commit record * actually gets written. * * PARAMETER: * tblk - * * RETURN: Errors from subroutines. */ static void txRelease(struct tblock * tblk) { struct metapage *mp; lid_t lid; struct tlock *tlck; TXN_LOCK(); for (lid = tblk->next; lid; lid = tlck->next) { tlck = lid_to_tlock(lid); if ((mp = tlck->mp) != NULL && (tlck->type & tlckBTROOT) == 0) { assert(mp->xflag & COMMIT_PAGE); mp->lid = 0; } } /* * wakeup transactions waiting on a page locked * by the current transaction */ TXN_WAKEUP(&tblk->waitor); TXN_UNLOCK(); } /* * NAME: txUnlock() * * FUNCTION: Initiates pageout of pages modified by tid in journalled * objects and frees their lockwords. */ static void txUnlock(struct tblock * tblk) { struct tlock *tlck; struct linelock *linelock; lid_t lid, next, llid, k; struct metapage *mp; struct jfs_log *log; int difft, diffp; unsigned long flags; jfs_info("txUnlock: tblk = 0x%p", tblk); log = JFS_SBI(tblk->sb)->log; /* * mark page under tlock homeok (its log has been written): */ for (lid = tblk->next; lid; lid = next) { tlck = lid_to_tlock(lid); next = tlck->next; jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck); /* unbind page from tlock */ if ((mp = tlck->mp) != NULL && (tlck->type & tlckBTROOT) == 0) { assert(mp->xflag & COMMIT_PAGE); /* hold buffer */ hold_metapage(mp); assert(mp->nohomeok > 0); _metapage_homeok(mp); /* inherit younger/larger clsn */ LOGSYNC_LOCK(log, flags); if (mp->clsn) { logdiff(difft, tblk->clsn, log); logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; } else mp->clsn = tblk->clsn; LOGSYNC_UNLOCK(log, flags); assert(!(tlck->flag & tlckFREEPAGE)); put_metapage(mp); } /* insert tlock, and linelock(s) of the tlock if any, * at head of freelist */ TXN_LOCK(); llid = ((struct linelock *) & tlck->lock)->next; while (llid) { linelock = (struct linelock *) lid_to_tlock(llid); k = linelock->next; txLockFree(llid); llid = k; } txLockFree(lid); TXN_UNLOCK(); } tblk->next = tblk->last = 0; /* * remove tblock from logsynclist * (allocation map pages inherited lsn of tblk and * has been inserted in logsync list at txUpdateMap()) */ if (tblk->lsn) { LOGSYNC_LOCK(log, flags); log->count--; list_del(&tblk->synclist); LOGSYNC_UNLOCK(log, flags); } } /* * txMaplock() * * function: allocate a transaction lock for freed page/entry; * for freed page, maplock is used as xtlock/dtlock type; */ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); lid_t lid; struct tblock *tblk; struct tlock *tlck; struct maplock *maplock; TXN_LOCK(); /* * allocate a tlock */ lid = txLockAlloc(); tlck = lid_to_tlock(lid); /* * initialize tlock */ tlck->tid = tid; /* bind the tlock and the object */ tlck->flag = tlckINODELOCK; if (S_ISDIR(ip->i_mode)) tlck->flag |= tlckDIRECTORY; tlck->ip = ip; tlck->mp = NULL; tlck->type = type; /* * enqueue transaction lock to transaction/inode */ /* insert the tlock at tail of transaction tlock list */ if (tid) { tblk = tid_to_tblock(tid); if (tblk->next) lid_to_tlock(tblk->last)->next = lid; else tblk->next = lid; tlck->next = 0; tblk->last = lid; } /* anonymous transaction: * insert the tlock at head of inode anonymous tlock list */ else { tlck->next = jfs_ip->atlhead; jfs_ip->atlhead = lid; if (tlck->next == 0) { /* This inode's first anonymous transaction */ jfs_ip->atltail = lid; list_add_tail(&jfs_ip->anon_inode_list, &TxAnchor.anon_list); } } TXN_UNLOCK(); /* initialize type dependent area for maplock */ maplock = (struct maplock *) & tlck->lock; maplock->next = 0; maplock->maxcnt = 0; maplock->index = 0; return tlck; } /* * txLinelock() * * function: allocate a transaction lock for log vector list */ struct linelock *txLinelock(struct linelock * tlock) { lid_t lid; struct tlock *tlck; struct linelock *linelock; TXN_LOCK(); /* allocate a TxLock structure */ lid = txLockAlloc(); tlck = lid_to_tlock(lid); TXN_UNLOCK(); /* initialize linelock */ linelock = (struct linelock *) tlck; linelock->next = 0; linelock->flag = tlckLINELOCK; linelock->maxcnt = TLOCKLONG; linelock->index = 0; if (tlck->flag & tlckDIRECTORY) linelock->flag |= tlckDIRECTORY; /* append linelock after tlock */ linelock->next = tlock->next; tlock->next = lid; return linelock; } /* * transaction commit management * ----------------------------- */ /* * NAME: txCommit() * * FUNCTION: commit the changes to the objects specified in * clist. For journalled segments only the * changes of the caller are committed, ie by tid. * for non-journalled segments the data are flushed to * disk and then the change to the disk inode and indirect * blocks committed (so blocks newly allocated to the * segment will be made a part of the segment atomically). * * all of the segments specified in clist must be in * one file system. no more than 6 segments are needed * to handle all unix svcs. * * if the i_nlink field (i.e. disk inode link count) * is zero, and the type of inode is a regular file or * directory, or symbolic link , the inode is truncated * to zero length. the truncation is committed but the * VM resources are unaffected until it is closed (see * iput and iclose). * * PARAMETER: * * RETURN: * * serialization: * on entry the inode lock on each segment is assumed * to be held. * * i/o error: */ int txCommit(tid_t tid, /* transaction identifier */ int nip, /* number of inodes to commit */ struct inode **iplist, /* list of inode to commit */ int flag) { int rc = 0; struct commit cd; struct jfs_log *log; struct tblock *tblk; struct lrd *lrd; struct inode *ip; struct jfs_inode_info *jfs_ip; int k, n; ino_t top; struct super_block *sb; jfs_info("txCommit, tid = %d, flag = %d", tid, flag); /* is read-only file system ? */ if (isReadOnly(iplist[0])) { rc = -EROFS; goto TheEnd; } sb = cd.sb = iplist[0]->i_sb; cd.tid = tid; if (tid == 0) tid = txBegin(sb, 0); tblk = tid_to_tblock(tid); /* * initialize commit structure */ log = JFS_SBI(sb)->log; cd.log = log; /* initialize log record descriptor in commit */ lrd = &cd.lrd; lrd->logtid = cpu_to_le32(tblk->logtid); lrd->backchain = 0; tblk->xflag |= flag; if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) tblk->xflag |= COMMIT_LAZY; /* * prepare non-journaled objects for commit * * flush data pages of non-journaled file * to prevent the file getting non-initialized disk blocks * in case of crash. * (new blocks - ) */ cd.iplist = iplist; cd.nip = nip; /* * acquire transaction lock on (on-disk) inodes * * update on-disk inode from in-memory inode * acquiring transaction locks for AFTER records * on the on-disk inode of file object * * sort the inodes array by inode number in descending order * to prevent deadlock when acquiring transaction lock * of on-disk inodes on multiple on-disk inode pages by * multiple concurrent transactions */ for (k = 0; k < cd.nip; k++) { top = (cd.iplist[k])->i_ino; for (n = k + 1; n < cd.nip; n++) { ip = cd.iplist[n]; if (ip->i_ino > top) { top = ip->i_ino; cd.iplist[n] = cd.iplist[k]; cd.iplist[k] = ip; } } ip = cd.iplist[k]; jfs_ip = JFS_IP(ip); /* * BUGBUG - This code has temporarily been removed. The * intent is to ensure that any file data is written before * the metadata is committed to the journal. This prevents * uninitialized data from appearing in a file after the * journal has been replayed. (The uninitialized data * could be sensitive data removed by another user.) * * The problem now is that we are holding the IWRITELOCK * on the inode, and calling filemap_fdatawrite on an * unmapped page will cause a deadlock in jfs_get_block. * * The long term solution is to pare down the use of * IWRITELOCK. We are currently holding it too long. * We could also be smarter about which data pages need * to be written before the transaction is committed and * when we don't need to worry about it at all. * * if ((!S_ISDIR(ip->i_mode)) * && (tblk->flag & COMMIT_DELETE) == 0) * filemap_write_and_wait(ip->i_mapping); */ /* * Mark inode as not dirty. It will still be on the dirty * inode list, but we'll know not to commit it again unless * it gets marked dirty again */ clear_cflag(COMMIT_Dirty, ip); /* inherit anonymous tlock(s) of inode */ if (jfs_ip->atlhead) { lid_to_tlock(jfs_ip->atltail)->next = tblk->next; tblk->next = jfs_ip->atlhead; if (!tblk->last) tblk->last = jfs_ip->atltail; jfs_ip->atlhead = jfs_ip->atltail = 0; TXN_LOCK(); list_del_init(&jfs_ip->anon_inode_list); TXN_UNLOCK(); } /* * acquire transaction lock on on-disk inode page * (become first tlock of the tblk's tlock list) */ if (((rc = diWrite(tid, ip)))) goto out; } /* * write log records from transaction locks * * txUpdateMap() resets XAD_NEW in XAD. */ txLog(log, tblk, &cd); /* * Ensure that inode isn't reused before * lazy commit thread finishes processing */ if (tblk->xflag & COMMIT_DELETE) { ihold(tblk->u.ip); /* * Avoid a rare deadlock * * If the inode is locked, we may be blocked in * jfs_commit_inode. If so, we don't want the * lazy_commit thread doing the last iput() on the inode * since that may block on the locked inode. Instead, * commit the transaction synchronously, so the last iput * will be done by the calling thread (or later) */ /* * I believe this code is no longer needed. Splitting I_LOCK * into two bits, I_NEW and I_SYNC should prevent this * deadlock as well. But since I don't have a JFS testload * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern */ if (tblk->u.ip->i_state & I_SYNC) tblk->xflag &= ~COMMIT_LAZY; } ASSERT((!(tblk->xflag & COMMIT_DELETE)) || ((tblk->u.ip->i_nlink == 0) && !test_cflag(COMMIT_Nolink, tblk->u.ip))); /* * write COMMIT log record */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; lmLog(log, tblk, lrd, NULL); lmGroupCommit(log, tblk); /* * - transaction is now committed - */ /* * force pages in careful update * (imap addressing structure update) */ if (flag & COMMIT_FORCE) txForce(tblk); /* * update allocation map. * * update inode allocation map and inode: * free pager lock on memory object of inode if any. * update block allocation map. * * txUpdateMap() resets XAD_NEW in XAD. */ if (tblk->xflag & COMMIT_FORCE) txUpdateMap(tblk); /* * free transaction locks and pageout/free pages */ txRelease(tblk); if ((tblk->flag & tblkGC_LAZY) == 0) txUnlock(tblk); /* * reset in-memory object state */ for (k = 0; k < cd.nip; k++) { ip = cd.iplist[k]; jfs_ip = JFS_IP(ip); /* * reset in-memory inode state */ jfs_ip->bxflag = 0; jfs_ip->blid = 0; } out: if (rc != 0) txAbort(tid, 1); TheEnd: jfs_info("txCommit: tid = %d, returning %d", tid, rc); return rc; } /* * NAME: txLog() * * FUNCTION: Writes AFTER log records for all lines modified * by tid for segments specified by inodes in comdata. * Code assumes only WRITELOCKS are recorded in lockwords. * * PARAMETERS: * * RETURN : */ static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd) { struct inode *ip; lid_t lid; struct tlock *tlck; struct lrd *lrd = &cd->lrd; /* * write log record(s) for each tlock of transaction, */ for (lid = tblk->next; lid; lid = tlck->next) { tlck = lid_to_tlock(lid); tlck->flag |= tlckLOG; /* initialize lrd common */ ip = tlck->ip; lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate); lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); /* write log record of page from the tlock */ switch (tlck->type & tlckTYPE) { case tlckXTREE: xtLog(log, tblk, lrd, tlck); break; case tlckDTREE: dtLog(log, tblk, lrd, tlck); break; case tlckINODE: diLog(log, tblk, lrd, tlck, cd); break; case tlckMAP: mapLog(log, tblk, lrd, tlck); break; case tlckDATA: dataLog(log, tblk, lrd, tlck); break; default: jfs_err("UFO tlock:0x%p", tlck); } } return; } /* * diLog() * * function: log inode tlock and format maplock to update bmap; */ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck, struct commit *cd) { struct metapage *mp; pxd_t *pxd; struct pxd_lock *pxdlock; mp = tlck->mp; /* initialize as REDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_INODE); lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); pxd = &lrd->log.redopage.pxd; /* * inode after image */ if (tlck->type & tlckENTRY) { /* log after-image for logredo(): */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; } else if (tlck->type & tlckFREE) { /* * free inode extent * * (pages of the freed inode extent have been invalidated and * a maplock for free of the extent has been formatted at * txLock() time); * * the tlock had been acquired on the inode allocation map page * (iag) that specifies the freed extent, even though the map * page is not itself logged, to prevent pageout of the map * page before the log; */ /* log LOG_NOREDOINOEXT of the freed inode extent for * logredo() to start NoRedoPage filters, and to update * imap and bmap for free of the extent; */ lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); /* * For the LOG_NOREDOINOEXT record, we need * to pass the IAG number and inode extent * index (within that IAG) from which the * extent is being released. These have been * passed to us in the iplist[1] and iplist[2]. */ lrd->log.noredoinoext.iagnum = cpu_to_le32((u32) (size_t) cd->iplist[1]); lrd->log.noredoinoext.inoext_idx = cpu_to_le32((u32) (size_t) cd->iplist[2]); pxdlock = (struct pxd_lock *) & tlck->lock; *pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* update bmap */ tlck->flag |= tlckUPDATEMAP; /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; } else jfs_err("diLog: UFO type tlck:0x%p", tlck); return; } /* * dataLog() * * function: log data tlock */ static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck) { struct metapage *mp; pxd_t *pxd; mp = tlck->mp; /* initialize as REDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_DATA); lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); pxd = &lrd->log.redopage.pxd; /* log after-image for logredo(): */ lrd->type = cpu_to_le16(LOG_REDOPAGE); if (jfs_dirtable_inline(tlck->ip)) { /* * The table has been truncated, we've must have deleted * the last entry, so don't bother logging this */ mp->lid = 0; grab_metapage(mp); metapage_homeok(mp); discard_metapage(mp); tlck->mp = NULL; return; } PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * dtLog() * * function: log dtree tlock and format maplock to update bmap; */ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { struct metapage *mp; struct pxd_lock *pxdlock; pxd_t *pxd; mp = tlck->mp; /* initialize as REDOPAGE/NOREDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); pxd = &lrd->log.redopage.pxd; if (tlck->type & tlckBTROOT) lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); /* * page extension via relocation: entry insertion; * page extension in-place: entry insertion; * new right page from page split, reinitialized in-line * root from root page split: entry insertion; */ if (tlck->type & (tlckNEW | tlckEXTEND)) { /* log after-image of the new page for logredo(): * mark log (LOG_NEW) for logredo() to initialize * freelist and update bmap for alloc of the new page; */ lrd->type = cpu_to_le16(LOG_REDOPAGE); if (tlck->type & tlckEXTEND) lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); else lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* format a maplock for txUpdateMap() to update bPMAP for * alloc of the new page; */ if (tlck->type & tlckBTROOT) return; tlck->flag |= tlckUPDATEMAP; pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckALLOCPXD; pxdlock->pxd = *pxd; pxdlock->index = 1; /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * entry insertion/deletion, * sibling page link update (old right page before split); */ if (tlck->type & (tlckENTRY | tlckRELINK)) { /* log after-image for logredo(): */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * page deletion: page has been invalidated * page relocation: source extent * * a maplock for free of the page has been formatted * at txLock() time); */ if (tlck->type & (tlckFREE | tlckRELOCATE)) { /* log LOG_NOREDOPAGE of the deleted page for logredo() * to start NoRedoPage filter and to update bmap for free * of the deletd page */ lrd->type = cpu_to_le16(LOG_NOREDOPAGE); pxdlock = (struct pxd_lock *) & tlck->lock; *pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time; */ tlck->flag |= tlckUPDATEMAP; } return; } /* * xtLog() * * function: log xtree tlock and format maplock to update bmap; */ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { struct inode *ip; struct metapage *mp; xtpage_t *p; struct xtlock *xtlck; struct maplock *maplock; struct xdlistlock *xadlock; struct pxd_lock *pxdlock; pxd_t *page_pxd; int next, lwm, hwm; ip = tlck->ip; mp = tlck->mp; /* initialize as REDOPAGE/NOREDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); page_pxd = &lrd->log.redopage.pxd; if (tlck->type & tlckBTROOT) { lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); p = (xtpage_t *) &JFS_IP(ip)->i_xtroot; if (S_ISDIR(ip->i_mode)) lrd->log.redopage.type |= cpu_to_le16(LOG_DIR_XTREE); } else p = (xtpage_t *) mp->data; next = le16_to_cpu(p->header.nextindex); xtlck = (struct xtlock *) & tlck->lock; maplock = (struct maplock *) & tlck->lock; xadlock = (struct xdlistlock *) maplock; /* * entry insertion/extension; * sibling page link update (old right page before split); */ if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { /* log after-image for logredo(): * logredo() will update bmap for alloc of new/extended * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from * after-image of XADlist; * logredo() resets (XAD_NEW|XAD_EXTEND) flag when * applying the after-image to the meta-data page. */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(page_pxd, mp->index); PXDlength(page_pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* format a maplock for txUpdateMap() to update bPMAP * for alloc of new/extended extents of XAD[lwm:next) * from the page itself; * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. */ lwm = xtlck->lwm.offset; if (lwm == 0) lwm = XTPAGEMAXSLOT; if (lwm == next) goto out; if (lwm > next) { jfs_err("xtLog: lwm > next"); goto out; } tlck->flag |= tlckUPDATEMAP; xadlock->flag = mlckALLOCXADLIST; xadlock->count = next - lwm; if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { int i; pxd_t *pxd; /* * Lazy commit may allow xtree to be modified before * txUpdateMap runs. Copy xad into linelock to * preserve correct data. * * We can fit twice as may pxd's as xads in the lock */ xadlock->flag = mlckALLOCPXDLIST; pxd = xadlock->xdlist = &xtlck->pxdlock; for (i = 0; i < xadlock->count; i++) { PXDaddress(pxd, addressXAD(&p->xad[lwm + i])); PXDlength(pxd, lengthXAD(&p->xad[lwm + i])); p->xad[lwm + i].flag &= ~(XAD_NEW | XAD_EXTENDED); pxd++; } } else { /* * xdlist will point to into inode's xtree, ensure * that transaction is not committed lazily. */ xadlock->flag = mlckALLOCXADLIST; xadlock->xdlist = &p->xad[lwm]; tblk->xflag &= ~COMMIT_LAZY; } jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d", tlck->ip, mp, tlck, lwm, xadlock->count); maplock->index = 1; out: /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * page deletion: file deletion/truncation (ref. xtTruncate()) * * (page will be invalidated after log is written and bmap * is updated from the page); */ if (tlck->type & tlckFREE) { /* LOG_NOREDOPAGE log for NoRedoPage filter: * if page free from file delete, NoRedoFile filter from * inode image of zero link count will subsume NoRedoPage * filters for each page; * if page free from file truncattion, write NoRedoPage * filter; * * upadte of block allocation map for the page itself: * if page free from deletion and truncation, LOG_UPDATEMAP * log for the page itself is generated from processing * its parent page xad entries; */ /* if page free from file truncation, log LOG_NOREDOPAGE * of the deleted page for logredo() to start NoRedoPage * filter for the page; */ if (tblk->xflag & COMMIT_TRUNCATE) { /* write NOREDOPAGE for the page */ lrd->type = cpu_to_le16(LOG_NOREDOPAGE); PXDaddress(page_pxd, mp->index); PXDlength(page_pxd, mp->logical_size >> tblk->sb-> s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); if (tlck->type & tlckBTROOT) { /* Empty xtree must be logged */ lrd->type = cpu_to_le16(LOG_REDOPAGE); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); } } /* init LOG_UPDATEMAP of the freed extents * XAD[XTENTRYSTART:hwm) from the deleted page itself * for logredo() to update bmap; */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); xtlck = (struct xtlock *) & tlck->lock; hwm = xtlck->hwm.offset; lrd->log.updatemap.nxd = cpu_to_le16(hwm - XTENTRYSTART + 1); /* reformat linelock for lmLog() */ xtlck->header.offset = XTENTRYSTART; xtlck->header.length = hwm - XTENTRYSTART + 1; xtlck->index = 1; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* format a maplock for txUpdateMap() to update bmap * to free extents of XAD[XTENTRYSTART:hwm) from the * deleted page itself; */ tlck->flag |= tlckUPDATEMAP; xadlock->count = hwm - XTENTRYSTART + 1; if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { int i; pxd_t *pxd; /* * Lazy commit may allow xtree to be modified before * txUpdateMap runs. Copy xad into linelock to * preserve correct data. * * We can fit twice as may pxd's as xads in the lock */ xadlock->flag = mlckFREEPXDLIST; pxd = xadlock->xdlist = &xtlck->pxdlock; for (i = 0; i < xadlock->count; i++) { PXDaddress(pxd, addressXAD(&p->xad[XTENTRYSTART + i])); PXDlength(pxd, lengthXAD(&p->xad[XTENTRYSTART + i])); pxd++; } } else { /* * xdlist will point to into inode's xtree, ensure * that transaction is not committed lazily. */ xadlock->flag = mlckFREEXADLIST; xadlock->xdlist = &p->xad[XTENTRYSTART]; tblk->xflag &= ~COMMIT_LAZY; } jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2", tlck->ip, mp, xadlock->count); maplock->index = 1; /* mark page as invalid */ if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) && !(tlck->type & tlckBTROOT)) tlck->flag |= tlckFREEPAGE; /* else (tblk->xflag & COMMIT_PMAP) ? release the page; */ return; } /* * page/entry truncation: file truncation (ref. xtTruncate()) * * |----------+------+------+---------------| * | | | * | | hwm - hwm before truncation * | next - truncation point * lwm - lwm before truncation * header ? */ if (tlck->type & tlckTRUNCATE) { pxd_t pxd; /* truncated extent of xad */ int twm; /* * For truncation the entire linelock may be used, so it would * be difficult to store xad list in linelock itself. * Therefore, we'll just force transaction to be committed * synchronously, so that xtree pages won't be changed before * txUpdateMap runs. */ tblk->xflag &= ~COMMIT_LAZY; lwm = xtlck->lwm.offset; if (lwm == 0) lwm = XTPAGEMAXSLOT; hwm = xtlck->hwm.offset; twm = xtlck->twm.offset; /* * write log records */ /* log after-image for logredo(): * * logredo() will update bmap for alloc of new/extended * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from * after-image of XADlist; * logredo() resets (XAD_NEW|XAD_EXTEND) flag when * applying the after-image to the meta-data page. */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(page_pxd, mp->index); PXDlength(page_pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* * truncate entry XAD[twm == next - 1]: */ if (twm == next - 1) { /* init LOG_UPDATEMAP for logredo() to update bmap for * free of truncated delta extent of the truncated * entry XAD[next - 1]: * (xtlck->pxdlock = truncated delta extent); */ pxdlock = (struct pxd_lock *) & xtlck->pxdlock; /* assert(pxdlock->type & tlckTRUNCATE); */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); lrd->log.updatemap.nxd = cpu_to_le16(1); lrd->log.updatemap.pxd = pxdlock->pxd; pxd = pxdlock->pxd; /* save to format maplock */ lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); } /* * free entries XAD[next:hwm]: */ if (hwm >= next) { /* init LOG_UPDATEMAP of the freed extents * XAD[next:hwm] from the deleted page itself * for logredo() to update bmap; */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); xtlck = (struct xtlock *) & tlck->lock; hwm = xtlck->hwm.offset; lrd->log.updatemap.nxd = cpu_to_le16(hwm - next + 1); /* reformat linelock for lmLog() */ xtlck->header.offset = next; xtlck->header.length = hwm - next + 1; xtlck->index = 1; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); } /* * format maplock(s) for txUpdateMap() to update bmap */ maplock->index = 0; /* * allocate entries XAD[lwm:next): */ if (lwm < next) { /* format a maplock for txUpdateMap() to update bPMAP * for alloc of new/extended extents of XAD[lwm:next) * from the page itself; * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. */ tlck->flag |= tlckUPDATEMAP; xadlock->flag = mlckALLOCXADLIST; xadlock->count = next - lwm; xadlock->xdlist = &p->xad[lwm]; jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d", tlck->ip, mp, xadlock->count, lwm, next); maplock->index++; xadlock++; } /* * truncate entry XAD[twm == next - 1]: */ if (twm == next - 1) { /* format a maplock for txUpdateMap() to update bmap * to free truncated delta extent of the truncated * entry XAD[next - 1]; * (xtlck->pxdlock = truncated delta extent); */ tlck->flag |= tlckUPDATEMAP; pxdlock = (struct pxd_lock *) xadlock; pxdlock->flag = mlckFREEPXD; pxdlock->count = 1; pxdlock->pxd = pxd; jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d", ip, mp, pxdlock->count, hwm); maplock->index++; xadlock++; } /* * free entries XAD[next:hwm]: */ if (hwm >= next) { /* format a maplock for txUpdateMap() to update bmap * to free extents of XAD[next:hwm] from thedeleted * page itself; */ tlck->flag |= tlckUPDATEMAP; xadlock->flag = mlckFREEXADLIST; xadlock->count = hwm - next + 1; xadlock->xdlist = &p->xad[next]; jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d", tlck->ip, mp, xadlock->count, next, hwm); maplock->index++; } /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; } return; } /* * mapLog() * * function: log from maplock of freed data extents; */ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { struct pxd_lock *pxdlock; int i, nlock; pxd_t *pxd; /* * page relocation: free the source page extent * * a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time saving the src * relocated page address; */ if (tlck->type & tlckRELOCATE) { /* log LOG_NOREDOPAGE of the old relocated page * for logredo() to start NoRedoPage filter; */ lrd->type = cpu_to_le16(LOG_NOREDOPAGE); pxdlock = (struct pxd_lock *) & tlck->lock; pxd = &lrd->log.redopage.pxd; *pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* (N.B. currently, logredo() does NOT update bmap * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); * if page free from relocation, LOG_UPDATEMAP log is * specifically generated now for logredo() * to update bmap for free of src relocated page; * (new flag LOG_RELOCATE may be introduced which will * inform logredo() to start NORedoPage filter and also * update block allocation map at the same time, thus * avoiding an extra log write); */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); lrd->log.updatemap.nxd = cpu_to_le16(1); lrd->log.updatemap.pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time; */ tlck->flag |= tlckUPDATEMAP; return; } /* * Otherwise it's not a relocate request * */ else { /* log LOG_UPDATEMAP for logredo() to update bmap for * free of truncated/relocated delta extent of the data; * e.g.: external EA extent, relocated/truncated extent * from xtTailgate(); */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); pxdlock = (struct pxd_lock *) & tlck->lock; nlock = pxdlock->index; for (i = 0; i < nlock; i++, pxdlock++) { if (pxdlock->flag & mlckALLOCPXD) lrd->log.updatemap.type = cpu_to_le16(LOG_ALLOCPXD); else lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); lrd->log.updatemap.nxd = cpu_to_le16(1); lrd->log.updatemap.pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); jfs_info("mapLog: xaddr:0x%lx xlen:0x%x", (ulong) addressPXD(&pxdlock->pxd), lengthPXD(&pxdlock->pxd)); } /* update bmap */ tlck->flag |= tlckUPDATEMAP; } } /* * txEA() * * function: acquire maplock for EA/ACL extents or * set COMMIT_INLINE flag; */ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) { struct tlock *tlck = NULL; struct pxd_lock *maplock = NULL, *pxdlock = NULL; /* * format maplock for alloc of new EA extent */ if (newea) { /* Since the newea could be a completely zeroed entry we need to * check for the two flags which indicate we should actually * commit new EA data */ if (newea->flag & DXD_EXTENT) { tlck = txMaplock(tid, ip, tlckMAP); maplock = (struct pxd_lock *) & tlck->lock; pxdlock = (struct pxd_lock *) maplock; pxdlock->flag = mlckALLOCPXD; PXDaddress(&pxdlock->pxd, addressDXD(newea)); PXDlength(&pxdlock->pxd, lengthDXD(newea)); pxdlock++; maplock->index = 1; } else if (newea->flag & DXD_INLINE) { tlck = NULL; set_cflag(COMMIT_Inlineea, ip); } } /* * format maplock for free of old EA extent */ if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { if (tlck == NULL) { tlck = txMaplock(tid, ip, tlckMAP); maplock = (struct pxd_lock *) & tlck->lock; pxdlock = (struct pxd_lock *) maplock; maplock->index = 0; } pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, addressDXD(oldea)); PXDlength(&pxdlock->pxd, lengthDXD(oldea)); maplock->index++; } } /* * txForce() * * function: synchronously write pages locked by transaction * after txLog() but before txUpdateMap(); */ static void txForce(struct tblock * tblk) { struct tlock *tlck; lid_t lid, next; struct metapage *mp; /* * reverse the order of transaction tlocks in * careful update order of address index pages * (right to left, bottom up) */ tlck = lid_to_tlock(tblk->next); lid = tlck->next; tlck->next = 0; while (lid) { tlck = lid_to_tlock(lid); next = tlck->next; tlck->next = tblk->next; tblk->next = lid; lid = next; } /* * synchronously write the page, and * hold the page for txUpdateMap(); */ for (lid = tblk->next; lid; lid = next) { tlck = lid_to_tlock(lid); next = tlck->next; if ((mp = tlck->mp) != NULL && (tlck->type & tlckBTROOT) == 0) { assert(mp->xflag & COMMIT_PAGE); if (tlck->flag & tlckWRITEPAGE) { tlck->flag &= ~tlckWRITEPAGE; /* do not release page to freelist */ force_metapage(mp); #if 0 /* * The "right" thing to do here is to * synchronously write the metadata. * With the current implementation this * is hard since write_metapage requires * us to kunmap & remap the page. If we * have tlocks pointing into the metadata * pages, we don't want to do this. I think * we can get by with synchronously writing * the pages when they are released. */ assert(mp->nohomeok); set_bit(META_dirty, &mp->flag); set_bit(META_sync, &mp->flag); #endif } } } } /* * txUpdateMap() * * function: update persistent allocation map (and working map * if appropriate); * * parameter: */ static void txUpdateMap(struct tblock * tblk) { struct inode *ip; struct inode *ipimap; lid_t lid; struct tlock *tlck; struct maplock *maplock; struct pxd_lock pxdlock; int maptype; int k, nlock; struct metapage *mp = NULL; ipimap = JFS_SBI(tblk->sb)->ipimap; maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; /* * update block allocation map * * update allocation state in pmap (and wmap) and * update lsn of the pmap page; */ /* * scan each tlock/page of transaction for block allocation/free: * * for each tlock/page of transaction, update map. * ? are there tlock for pmap and pwmap at the same time ? */ for (lid = tblk->next; lid; lid = tlck->next) { tlck = lid_to_tlock(lid); if ((tlck->flag & tlckUPDATEMAP) == 0) continue; if (tlck->flag & tlckFREEPAGE) { /* * Another thread may attempt to reuse freed space * immediately, so we want to get rid of the metapage * before anyone else has a chance to get it. * Lock metapage, update maps, then invalidate * the metapage. */ mp = tlck->mp; ASSERT(mp->xflag & COMMIT_PAGE); grab_metapage(mp); } /* * extent list: * . in-line PXD list: * . out-of-line XAD list: */ maplock = (struct maplock *) & tlck->lock; nlock = maplock->index; for (k = 0; k < nlock; k++, maplock++) { /* * allocate blocks in persistent map: * * blocks have been allocated from wmap at alloc time; */ if (maplock->flag & mlckALLOC) { txAllocPMap(ipimap, maplock, tblk); } /* * free blocks in persistent and working map: * blocks will be freed in pmap and then in wmap; * * ? tblock specifies the PMAP/PWMAP based upon * transaction * * free blocks in persistent map: * blocks will be freed from wmap at last reference * release of the object for regular files; * * Alway free blocks from both persistent & working * maps for directories */ else { /* (maplock->flag & mlckFREE) */ if (tlck->flag & tlckDIRECTORY) txFreeMap(ipimap, maplock, tblk, COMMIT_PWMAP); else txFreeMap(ipimap, maplock, tblk, maptype); } } if (tlck->flag & tlckFREEPAGE) { if (!(tblk->flag & tblkGC_LAZY)) { /* This is equivalent to txRelease */ ASSERT(mp->lid == lid); tlck->mp->lid = 0; } assert(mp->nohomeok == 1); metapage_homeok(mp); discard_metapage(mp); tlck->mp = NULL; } } /* * update inode allocation map * * update allocation state in pmap and * update lsn of the pmap page; * update in-memory inode flag/state * * unlock mapper/write lock */ if (tblk->xflag & COMMIT_CREATE) { diUpdatePMap(ipimap, tblk->ino, false, tblk); /* update persistent block allocation map * for the allocation of inode extent; */ pxdlock.flag = mlckALLOCPXD; pxdlock.pxd = tblk->u.ixpxd; pxdlock.index = 1; txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk); } else if (tblk->xflag & COMMIT_DELETE) { ip = tblk->u.ip; diUpdatePMap(ipimap, ip->i_ino, true, tblk); iput(ip); } } /* * txAllocPMap() * * function: allocate from persistent map; * * parameter: * ipbmap - * malock - * xad list: * pxd: * * maptype - * allocate from persistent map; * free from persistent map; * (e.g., tmp file - free from working map at releae * of last reference); * free from persistent and working map; * * lsn - log sequence number; */ static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk) { struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct xdlistlock *xadlistlock; xad_t *xad; s64 xaddr; int xlen; struct pxd_lock *pxdlock; struct xdlistlock *pxdlistlock; pxd_t *pxd; int n; /* * allocate from persistent map; */ if (maplock->flag & mlckALLOCXADLIST) { xadlistlock = (struct xdlistlock *) maplock; xad = xadlistlock->xdlist; for (n = 0; n < xadlistlock->count; n++, xad++) { if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { xaddr = addressXAD(xad); xlen = lengthXAD(xad); dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); xad->flag &= ~(XAD_NEW | XAD_EXTENDED); jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } else if (maplock->flag & mlckALLOCPXD) { pxdlock = (struct pxd_lock *) maplock; xaddr = addressPXD(&pxdlock->pxd); xlen = lengthPXD(&pxdlock->pxd); dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } else { /* (maplock->flag & mlckALLOCPXDLIST) */ pxdlistlock = (struct xdlistlock *) maplock; pxd = pxdlistlock->xdlist; for (n = 0; n < pxdlistlock->count; n++, pxd++) { xaddr = addressPXD(pxd); xlen = lengthPXD(pxd); dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } /* * txFreeMap() * * function: free from persistent and/or working map; * * todo: optimization */ void txFreeMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk, int maptype) { struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct xdlistlock *xadlistlock; xad_t *xad; s64 xaddr; int xlen; struct pxd_lock *pxdlock; struct xdlistlock *pxdlistlock; pxd_t *pxd; int n; jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x", tblk, maplock, maptype); /* * free from persistent map; */ if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { if (maplock->flag & mlckFREEXADLIST) { xadlistlock = (struct xdlistlock *) maplock; xad = xadlistlock->xdlist; for (n = 0; n < xadlistlock->count; n++, xad++) { if (!(xad->flag & XAD_NEW)) { xaddr = addressXAD(xad); xlen = lengthXAD(xad); dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, tblk); jfs_info("freePMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } else if (maplock->flag & mlckFREEPXD) { pxdlock = (struct pxd_lock *) maplock; xaddr = addressPXD(&pxdlock->pxd); xlen = lengthPXD(&pxdlock->pxd); dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, tblk); jfs_info("freePMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } else { /* (maplock->flag & mlckALLOCPXDLIST) */ pxdlistlock = (struct xdlistlock *) maplock; pxd = pxdlistlock->xdlist; for (n = 0; n < pxdlistlock->count; n++, pxd++) { xaddr = addressPXD(pxd); xlen = lengthPXD(pxd); dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, tblk); jfs_info("freePMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } /* * free from working map; */ if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { if (maplock->flag & mlckFREEXADLIST) { xadlistlock = (struct xdlistlock *) maplock; xad = xadlistlock->xdlist; for (n = 0; n < xadlistlock->count; n++, xad++) { xaddr = addressXAD(xad); xlen = lengthXAD(xad); dbFree(ip, xaddr, (s64) xlen); xad->flag = 0; jfs_info("freeWMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } else if (maplock->flag & mlckFREEPXD) { pxdlock = (struct pxd_lock *) maplock; xaddr = addressPXD(&pxdlock->pxd); xlen = lengthPXD(&pxdlock->pxd); dbFree(ip, xaddr, (s64) xlen); jfs_info("freeWMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } else { /* (maplock->flag & mlckFREEPXDLIST) */ pxdlistlock = (struct xdlistlock *) maplock; pxd = pxdlistlock->xdlist; for (n = 0; n < pxdlistlock->count; n++, pxd++) { xaddr = addressPXD(pxd); xlen = lengthPXD(pxd); dbFree(ip, xaddr, (s64) xlen); jfs_info("freeWMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } } /* * txFreelock() * * function: remove tlock from inode anonymous locklist */ void txFreelock(struct inode *ip) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct tlock *xtlck, *tlck; lid_t xlid = 0, lid; if (!jfs_ip->atlhead) return; TXN_LOCK(); xtlck = (struct tlock *) &jfs_ip->atlhead; while ((lid = xtlck->next) != 0) { tlck = lid_to_tlock(lid); if (tlck->flag & tlckFREELOCK) { xtlck->next = tlck->next; txLockFree(lid); } else { xtlck = tlck; xlid = lid; } } if (jfs_ip->atlhead) jfs_ip->atltail = xlid; else { jfs_ip->atltail = 0; /* * If inode was on anon_list, remove it */ list_del_init(&jfs_ip->anon_inode_list); } TXN_UNLOCK(); } /* * txAbort() * * function: abort tx before commit; * * frees line-locks and segment locks for all * segments in comdata structure. * Optionally sets state of file-system to FM_DIRTY in super-block. * log age of page-frames in memory for which caller has * are reset to 0 (to avoid logwarap). */ void txAbort(tid_t tid, int dirty) { lid_t lid, next; struct metapage *mp; struct tblock *tblk = tid_to_tblock(tid); struct tlock *tlck; /* * free tlocks of the transaction */ for (lid = tblk->next; lid; lid = next) { tlck = lid_to_tlock(lid); next = tlck->next; mp = tlck->mp; JFS_IP(tlck->ip)->xtlid = 0; if (mp) { mp->lid = 0; /* * reset lsn of page to avoid logwarap: * * (page may have been previously committed by another * transaction(s) but has not been paged, i.e., * it may be on logsync list even though it has not * been logged for the current tx.) */ if (mp->xflag & COMMIT_PAGE && mp->lsn) LogSyncRelease(mp); } /* insert tlock at head of freelist */ TXN_LOCK(); txLockFree(lid); TXN_UNLOCK(); } /* caller will free the transaction block */ tblk->next = tblk->last = 0; /* * mark filesystem dirty */ if (dirty) jfs_error(tblk->sb, "\n"); return; } /* * txLazyCommit(void) * * All transactions except those changing ipimap (COMMIT_FORCE) are * processed by this routine. This insures that the inode and block * allocation maps are updated in order. For synchronous transactions, * let the user thread finish processing after txUpdateMap() is called. */ static void txLazyCommit(struct tblock * tblk) { struct jfs_log *log; while (((tblk->flag & tblkGC_READY) == 0) && ((tblk->flag & tblkGC_UNLOCKED) == 0)) { /* We must have gotten ahead of the user thread */ jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk); yield(); } jfs_info("txLazyCommit: processing tblk 0x%p", tblk); txUpdateMap(tblk); log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; spin_lock_irq(&log->gclock); // LOGGC_LOCK tblk->flag |= tblkGC_COMMITTED; if (tblk->flag & tblkGC_READY) log->gcrtc--; wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP /* * Can't release log->gclock until we've tested tblk->flag */ if (tblk->flag & tblkGC_LAZY) { spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK txUnlock(tblk); tblk->flag &= ~tblkGC_LAZY; txEnd(tblk - TxBlock); /* Convert back to tid */ } else spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK jfs_info("txLazyCommit: done: tblk = 0x%p", tblk); } /* * jfs_lazycommit(void) * * To be run as a kernel daemon. If lbmIODone is called in an interrupt * context, or where blocking is not wanted, this routine will process * committed transactions from the unlock queue. */ int jfs_lazycommit(void *arg) { int WorkDone; struct tblock *tblk; unsigned long flags; struct jfs_sb_info *sbi; set_freezable(); do { LAZY_LOCK(flags); jfs_commit_thread_waking = 0; /* OK to wake another thread */ while (!list_empty(&TxAnchor.unlock_queue)) { WorkDone = 0; list_for_each_entry(tblk, &TxAnchor.unlock_queue, cqueue) { sbi = JFS_SBI(tblk->sb); /* * For each volume, the transactions must be * handled in order. If another commit thread * is handling a tblk for this superblock, * skip it */ if (sbi->commit_state & IN_LAZYCOMMIT) continue; sbi->commit_state |= IN_LAZYCOMMIT; WorkDone = 1; /* * Remove transaction from queue */ list_del(&tblk->cqueue); LAZY_UNLOCK(flags); txLazyCommit(tblk); LAZY_LOCK(flags); sbi->commit_state &= ~IN_LAZYCOMMIT; /* * Don't continue in the for loop. (We can't * anyway, it's unsafe!) We want to go back to * the beginning of the list. */ break; } /* If there was nothing to do, don't continue */ if (!WorkDone) break; } /* In case a wakeup came while all threads were active */ jfs_commit_thread_waking = 0; if (freezing(current)) { LAZY_UNLOCK(flags); try_to_freeze(); } else { DECLARE_WAITQUEUE(wq, current); add_wait_queue(&jfs_commit_thread_wait, &wq); set_current_state(TASK_INTERRUPTIBLE); LAZY_UNLOCK(flags); schedule(); remove_wait_queue(&jfs_commit_thread_wait, &wq); } } while (!kthread_should_stop()); if (!list_empty(&TxAnchor.unlock_queue)) jfs_err("jfs_lazycommit being killed w/pending transactions!"); else jfs_info("jfs_lazycommit being killed"); return 0; } void txLazyUnlock(struct tblock * tblk) { unsigned long flags; LAZY_LOCK(flags); list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue); /* * Don't wake up a commit thread if there is already one servicing * this superblock, or if the last one we woke up hasn't started yet. */ if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) && !jfs_commit_thread_waking) { jfs_commit_thread_waking = 1; wake_up(&jfs_commit_thread_wait); } LAZY_UNLOCK(flags); } static void LogSyncRelease(struct metapage * mp) { struct jfs_log *log = mp->log; assert(mp->nohomeok); assert(log); metapage_homeok(mp); } /* * txQuiesce * * Block all new transactions and push anonymous transactions to * completion * * This does almost the same thing as jfs_sync below. We don't * worry about deadlocking when jfs_tlocks_low is set, since we would * expect jfs_sync to get us out of that jam. */ void txQuiesce(struct super_block *sb) { struct inode *ip; struct jfs_inode_info *jfs_ip; struct jfs_log *log = JFS_SBI(sb)->log; tid_t tid; set_bit(log_QUIESCE, &log->flag); TXN_LOCK(); restart: while (!list_empty(&TxAnchor.anon_list)) { jfs_ip = list_entry(TxAnchor.anon_list.next, struct jfs_inode_info, anon_inode_list); ip = &jfs_ip->vfs_inode; /* * inode will be removed from anonymous list * when it is committed */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE); mutex_lock(&jfs_ip->commit_mutex); txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); /* * Just to be safe. I don't know how * long we can run without blocking */ cond_resched(); TXN_LOCK(); } /* * If jfs_sync is running in parallel, there could be some inodes * on anon_list2. Let's check. */ if (!list_empty(&TxAnchor.anon_list2)) { list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); goto restart; } TXN_UNLOCK(); /* * We may need to kick off the group commit */ jfs_flush_journal(log, 0); } /* * txResume() * * Allows transactions to start again following txQuiesce */ void txResume(struct super_block *sb) { struct jfs_log *log = JFS_SBI(sb)->log; clear_bit(log_QUIESCE, &log->flag); TXN_WAKEUP(&log->syncwait); } /* * jfs_sync(void) * * To be run as a kernel daemon. This is awakened when tlocks run low. * We write any inodes that have anonymous tlocks so they will become * available. */ int jfs_sync(void *arg) { struct inode *ip; struct jfs_inode_info *jfs_ip; tid_t tid; set_freezable(); do { /* * write each inode on the anonymous inode list */ TXN_LOCK(); while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) { jfs_ip = list_entry(TxAnchor.anon_list.next, struct jfs_inode_info, anon_inode_list); ip = &jfs_ip->vfs_inode; if (! igrab(ip)) { /* * Inode is being freed */ list_del_init(&jfs_ip->anon_inode_list); } else if (mutex_trylock(&jfs_ip->commit_mutex)) { /* * inode will be removed from anonymous list * when it is committed */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE); txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); iput(ip); /* * Just to be safe. I don't know how * long we can run without blocking */ cond_resched(); TXN_LOCK(); } else { /* We can't get the commit mutex. It may * be held by a thread waiting for tlock's * so let's not block here. Save it to * put back on the anon_list. */ /* Move from anon_list to anon_list2 */ list_move(&jfs_ip->anon_inode_list, &TxAnchor.anon_list2); TXN_UNLOCK(); iput(ip); TXN_LOCK(); } } /* Add anon_list2 back to anon_list */ list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); if (freezing(current)) { TXN_UNLOCK(); try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); schedule(); } } while (!kthread_should_stop()); jfs_info("jfs_sync being killed"); return 0; } #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) int jfs_txanchor_proc_show(struct seq_file *m, void *v) { char *freewait; char *freelockwait; char *lowlockwait; freewait = waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; freelockwait = waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; lowlockwait = waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; seq_printf(m, "JFS TxAnchor\n" "============\n" "freetid = %d\n" "freewait = %s\n" "freelock = %d\n" "freelockwait = %s\n" "lowlockwait = %s\n" "tlocksInUse = %d\n" "jfs_tlocks_low = %d\n" "unlock_queue is %sempty\n", TxAnchor.freetid, freewait, TxAnchor.freelock, freelockwait, lowlockwait, TxAnchor.tlocksInUse, jfs_tlocks_low, list_empty(&TxAnchor.unlock_queue) ? "" : "not "); return 0; } #endif #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) int jfs_txstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS TxStats\n" "===========\n" "calls to txBegin = %d\n" "txBegin blocked by sync barrier = %d\n" "txBegin blocked by tlocks low = %d\n" "txBegin blocked by no free tid = %d\n" "calls to txBeginAnon = %d\n" "txBeginAnon blocked by sync barrier = %d\n" "txBeginAnon blocked by tlocks low = %d\n" "calls to txLockAlloc = %d\n" "tLockAlloc blocked by no free lock = %d\n", TxStat.txBegin, TxStat.txBegin_barrier, TxStat.txBegin_lockslow, TxStat.txBegin_freetid, TxStat.txBeginAnon, TxStat.txBeginAnon_barrier, TxStat.txBeginAnon_lockslow, TxStat.txLockAlloc, TxStat.txLockAlloc_freelock); return 0; } #endif
4 1 1 2 2 1 1 2 2 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/user.c * * This file provides the user space interface for software suspend/resume. * * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> */ #include <linux/suspend.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> #include <linux/miscdevice.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> #include <linux/fs.h> #include <linux/compat.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/uaccess.h> #include "power.h" static bool need_wait; static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; bool frozen; bool ready; bool platform_support; bool free_bitmaps; dev_t dev; } snapshot_state; int is_hibernate_resume_dev(dev_t dev) { return hibernation_available() && snapshot_state.dev == dev; } static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; unsigned int sleep_flags; int error; if (!hibernation_available()) return -EPERM; sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { hibernate_release(); error = -ENOSYS; goto Unlock; } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); } else { /* * Resuming. We may need to wait for the image device to * appear. */ need_wait = true; data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; } } if (error) hibernate_release(); data->frozen = false; data->ready = false; data->platform_support = false; data->dev = 0; Unlock: unlock_system_sleep(sleep_flags); return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; unsigned int sleep_flags; sleep_flags = lock_system_sleep(); swsusp_free(); data = filp->private_data; data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); free_basic_memory_bitmaps(); thaw_processes(); } else if (data->free_bitmaps) { free_basic_memory_bitmaps(); } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); hibernate_release(); unlock_system_sleep(sleep_flags); return 0; } static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; unsigned int sleep_flags; ssize_t res; sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { res = -ENODATA; goto Unlock; } if (!pg_offp) { /* on page boundary? */ res = snapshot_read_next(&data->handle); if (res <= 0) goto Unlock; } else { res = PAGE_SIZE - pg_offp; } res = simple_read_from_buffer(buf, count, &pg_offp, data_of(data->handle), res); if (res > 0) *offp += res; Unlock: unlock_system_sleep(sleep_flags); return res; } static ssize_t snapshot_write(struct file *filp, const char __user *buf, size_t count, loff_t *offp) { loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; unsigned long sleep_flags; ssize_t res; if (need_wait) { wait_for_device_probe(); need_wait = false; } sleep_flags = lock_system_sleep(); data = filp->private_data; if (!pg_offp) { res = snapshot_write_next(&data->handle); if (res <= 0) goto unlock; } else { res = PAGE_SIZE; } if (!data_of(data->handle)) { res = -EINVAL; goto unlock; } res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, buf, count); if (res > 0) *offp += res; unlock: unlock_system_sleep(sleep_flags); return res; } struct compat_resume_swap_area { compat_loff_t offset; u32 dev; } __packed; static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { sector_t offset; dev_t swdev; if (swsusp_swap_in_use()) return -EPERM; if (in_compat_syscall()) { struct compat_resume_swap_area swap_area; if (copy_from_user(&swap_area, argp, sizeof(swap_area))) return -EFAULT; swdev = new_decode_dev(swap_area.dev); offset = swap_area.offset; } else { struct resume_swap_area swap_area; if (copy_from_user(&swap_area, argp, sizeof(swap_area))) return -EFAULT; swdev = new_decode_dev(swap_area.dev); offset = swap_area.offset; } /* * User space encodes device types as two-byte values, * so we need to recode them */ data->swap = swap_type_of(swdev, offset); if (data->swap < 0) return swdev ? -ENODEV : -EINVAL; data->dev = swdev; return 0; } static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = 0; struct snapshot_data *data; loff_t size; sector_t offset; if (need_wait) { wait_for_device_probe(); need_wait = false; } if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) return -ENOTTY; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; lock_device_hotplug(); data = filp->private_data; switch (cmd) { case SNAPSHOT_FREEZE: if (data->frozen) break; ksys_sync_helper(); error = freeze_processes(); if (error) break; error = create_basic_memory_bitmaps(); if (error) thaw_processes(); else data->frozen = true; break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; pm_restore_gfp_mask(); free_basic_memory_bitmaps(); data->free_bitmaps = false; thaw_processes(); data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); if (!error) { error = put_user(in_suspend, (int __user *)arg); data->ready = !freezer_test_done && !error; freezer_test_done = false; } break; case SNAPSHOT_ATOMIC_RESTORE: error = snapshot_write_finalize(&data->handle); if (error) break; if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; break; } error = hibernation_restore(data->platform_support); break; case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = false; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after * SNAPSHOT_FREE. In that case, if kernel threads were not * thawed, the preallocation of memory carried out by * hibernation_snapshot() might run into problems (i.e. it * might fail or even deadlock). */ thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; case SNAPSHOT_GET_IMAGE_SIZE: if (!data->ready) { error = -ENODATA; break; } size = snapshot_get_image_size(); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; } offset = alloc_swapdev_block(data->swap); if (offset) { offset <<= PAGE_SHIFT; error = put_user(offset, (loff_t __user *)arg); } else { error = -ENOSPC; } break; case SNAPSHOT_FREE_SWAP_PAGES: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; } free_all_swap_pages(data->swap); break; case SNAPSHOT_S2RAM: if (!data->frozen) { error = -EPERM; break; } /* * Tasks are frozen and the notifiers have been called with * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); data->ready = false; break; case SNAPSHOT_PLATFORM_SUPPORT: data->platform_support = !!arg; break; case SNAPSHOT_POWER_OFF: if (data->platform_support) error = hibernation_platform_enter(); break; case SNAPSHOT_SET_SWAP_AREA: error = snapshot_set_swap_area(data, (void __user *)arg); break; default: error = -ENOTTY; } unlock_device_hotplug(); mutex_unlock(&system_transition_mutex); return error; } #ifdef CONFIG_COMPAT static long snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); switch (cmd) { case SNAPSHOT_GET_IMAGE_SIZE: case SNAPSHOT_AVAIL_SWAP_SIZE: case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_CREATE_IMAGE: case SNAPSHOT_SET_SWAP_AREA: return snapshot_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); default: return snapshot_ioctl(file, cmd, arg); } } #endif /* CONFIG_COMPAT */ static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, .unlocked_ioctl = snapshot_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snapshot_compat_ioctl, #endif }; static struct miscdevice snapshot_device = { .minor = SNAPSHOT_MINOR, .name = "snapshot", .fops = &snapshot_fops, }; static int __init snapshot_device_init(void) { return misc_register(&snapshot_device); }; device_initcall(snapshot_device_init);
3 4 4 3 2 4 4 4 4 4 3 4 4 4 4 4 4 4 3 4 4 1 4 4 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 // SPDX-License-Identifier: GPL-2.0-or-later /* procfs files for key database enumeration * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <asm/errno.h> #include "internal.h" static void *proc_keys_start(struct seq_file *p, loff_t *_pos); static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos); static void proc_keys_stop(struct seq_file *p, void *v); static int proc_keys_show(struct seq_file *m, void *v); static const struct seq_operations proc_keys_ops = { .start = proc_keys_start, .next = proc_keys_next, .stop = proc_keys_stop, .show = proc_keys_show, }; static void *proc_key_users_start(struct seq_file *p, loff_t *_pos); static void *proc_key_users_next(struct seq_file *p, void *v, loff_t *_pos); static void proc_key_users_stop(struct seq_file *p, void *v); static int proc_key_users_show(struct seq_file *m, void *v); static const struct seq_operations proc_key_users_ops = { .start = proc_key_users_start, .next = proc_key_users_next, .stop = proc_key_users_stop, .show = proc_key_users_show, }; /* * Declare the /proc files. */ static int __init key_proc_init(void) { struct proc_dir_entry *p; p = proc_create_seq("keys", 0, NULL, &proc_keys_ops); if (!p) panic("Cannot create /proc/keys\n"); p = proc_create_seq("key-users", 0, NULL, &proc_key_users_ops); if (!p) panic("Cannot create /proc/key-users\n"); return 0; } __initcall(key_proc_init); /* * Implement "/proc/keys" to provide a list of the keys on the system that * grant View permission to the caller. */ static struct rb_node *key_serial_next(struct seq_file *p, struct rb_node *n) { struct user_namespace *user_ns = seq_user_ns(p); n = rb_next(n); while (n) { struct key *key = rb_entry(n, struct key, serial_node); if (kuid_has_mapping(user_ns, key->user->uid)) break; n = rb_next(n); } return n; } static struct key *find_ge_key(struct seq_file *p, key_serial_t id) { struct user_namespace *user_ns = seq_user_ns(p); struct rb_node *n = key_serial_tree.rb_node; struct key *minkey = NULL; while (n) { struct key *key = rb_entry(n, struct key, serial_node); if (id < key->serial) { if (!minkey || minkey->serial > key->serial) minkey = key; n = n->rb_left; } else if (id > key->serial) { n = n->rb_right; } else { minkey = key; break; } key = NULL; } if (!minkey) return NULL; for (;;) { if (kuid_has_mapping(user_ns, minkey->user->uid)) return minkey; n = rb_next(&minkey->serial_node); if (!n) return NULL; minkey = rb_entry(n, struct key, serial_node); } } static void *proc_keys_start(struct seq_file *p, loff_t *_pos) __acquires(key_serial_lock) { key_serial_t pos = *_pos; struct key *key; spin_lock(&key_serial_lock); if (*_pos > INT_MAX) return NULL; key = find_ge_key(p, pos); if (!key) return NULL; *_pos = key->serial; return &key->serial_node; } static inline key_serial_t key_node_serial(struct rb_node *n) { struct key *key = rb_entry(n, struct key, serial_node); return key->serial; } static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos) { struct rb_node *n; n = key_serial_next(p, v); if (n) *_pos = key_node_serial(n); else (*_pos)++; return n; } static void proc_keys_stop(struct seq_file *p, void *v) __releases(key_serial_lock) { spin_unlock(&key_serial_lock); } static int proc_keys_show(struct seq_file *m, void *v) { struct rb_node *_p = v; struct key *key = rb_entry(_p, struct key, serial_node); unsigned long flags; key_ref_t key_ref, skey_ref; time64_t now, expiry; char xbuf[16]; short state; u64 timo; int rc; struct keyring_search_context ctx = { .index_key = key->index_key, .cred = m->file->f_cred, .match_data.cmp = lookup_user_key_possessed, .match_data.raw_data = key, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; key_ref = make_key_ref(key, 0); /* determine if the key is possessed by this process (a test we can * skip if the key does not indicate the possessor can view it */ if (key->perm & KEY_POS_VIEW) { rcu_read_lock(); skey_ref = search_cred_keyrings_rcu(&ctx); rcu_read_unlock(); if (!IS_ERR(skey_ref)) { key_ref_put(skey_ref); key_ref = make_key_ref(key, 1); } } /* check whether the current task is allowed to view the key */ rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW); if (rc < 0) return 0; now = ktime_get_real_seconds(); rcu_read_lock(); /* come up with a suitable timeout value */ expiry = READ_ONCE(key->expiry); if (expiry == TIME64_MAX) { memcpy(xbuf, "perm", 5); } else if (now >= expiry) { memcpy(xbuf, "expd", 5); } else { timo = expiry - now; if (timo < 60) sprintf(xbuf, "%llus", timo); else if (timo < 60*60) sprintf(xbuf, "%llum", div_u64(timo, 60)); else if (timo < 60*60*24) sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60)); else if (timo < 60*60*24*7) sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24)); else sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7)); } state = key_read_state(key); #define showflag(FLAGS, LETTER, FLAG) \ ((FLAGS & (1 << FLAG)) ? LETTER : '-') flags = READ_ONCE(key->flags); seq_printf(m, "%08x %c%c%c%c%c%c%c %5d %4s %08x %5d %5d %-9.9s ", key->serial, state != KEY_IS_UNINSTANTIATED ? 'I' : '-', showflag(flags, 'R', KEY_FLAG_REVOKED), showflag(flags, 'D', KEY_FLAG_DEAD), showflag(flags, 'Q', KEY_FLAG_IN_QUOTA), showflag(flags, 'U', KEY_FLAG_USER_CONSTRUCT), state < 0 ? 'N' : '-', showflag(flags, 'i', KEY_FLAG_INVALIDATED), refcount_read(&key->usage), xbuf, key->perm, from_kuid_munged(seq_user_ns(m), key->uid), from_kgid_munged(seq_user_ns(m), key->gid), key->type->name); #undef showflag if (key->type->describe) key->type->describe(key, m); seq_putc(m, '\n'); rcu_read_unlock(); return 0; } static struct rb_node *__key_user_next(struct user_namespace *user_ns, struct rb_node *n) { while (n) { struct key_user *user = rb_entry(n, struct key_user, node); if (kuid_has_mapping(user_ns, user->uid)) break; n = rb_next(n); } return n; } static struct rb_node *key_user_next(struct user_namespace *user_ns, struct rb_node *n) { return __key_user_next(user_ns, rb_next(n)); } static struct rb_node *key_user_first(struct user_namespace *user_ns, struct rb_root *r) { struct rb_node *n = rb_first(r); return __key_user_next(user_ns, n); } static void *proc_key_users_start(struct seq_file *p, loff_t *_pos) __acquires(key_user_lock) { struct rb_node *_p; loff_t pos = *_pos; spin_lock(&key_user_lock); _p = key_user_first(seq_user_ns(p), &key_user_tree); while (pos > 0 && _p) { pos--; _p = key_user_next(seq_user_ns(p), _p); } return _p; } static void *proc_key_users_next(struct seq_file *p, void *v, loff_t *_pos) { (*_pos)++; return key_user_next(seq_user_ns(p), (struct rb_node *)v); } static void proc_key_users_stop(struct seq_file *p, void *v) __releases(key_user_lock) { spin_unlock(&key_user_lock); } static int proc_key_users_show(struct seq_file *m, void *v) { struct rb_node *_p = v; struct key_user *user = rb_entry(_p, struct key_user, node); unsigned maxkeys = uid_eq(user->uid, GLOBAL_ROOT_UID) ? key_quota_root_maxkeys : key_quota_maxkeys; unsigned maxbytes = uid_eq(user->uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; seq_printf(m, "%5u: %5d %d/%d %d/%d %d/%d\n", from_kuid_munged(seq_user_ns(m), user->uid), refcount_read(&user->usage), atomic_read(&user->nkeys), atomic_read(&user->nikeys), user->qnkeys, maxkeys, user->qnbytes, maxbytes); return 0; }
1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/sched.h> struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; struct cgroup_namespace; struct fs_struct; /* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * The pid namespace is an exception -- it's accessed using * task_active_pid_ns. The pid namespace here is the * namespace that children will use. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */ struct nsproxy { refcount_t count; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; struct time_namespace *time_ns; struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. * * If a new user namespace is requested cred will * point to a modifiable set of credentials. If a pointer * to a modifiable set is needed nsset_cred() must be * used and tested. */ struct nsset { unsigned flags; struct nsproxy *nsproxy; struct fs_struct *fs; const struct cred *cred; }; static inline struct cred *nsset_cred(struct nsset *set) { if (set->flags & CLONE_NEWUSER) return (struct cred *)set->cred; return NULL; } /* * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or * any pointer on the nsproxy itself. Current must hold the task_lock * when changing tsk->nsproxy. * * 2. when accessing (i.e. reading) current task's namespaces - no * precautions should be taken - just dereference the pointers * * 3. the access to other task namespaces is performed like this * task_lock(task); * nsproxy = task->nsproxy; * if (nsproxy != NULL) { * / * * * work with the namespaces here * * e.g. get the reference on one of them * * / * } / * * * NULL task->nsproxy means that this task is * * almost dead (zombie) * * / * task_unlock(task); * */ int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) free_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) { refcount_inc(&ns->count); } DEFINE_FREE(put_nsproxy, struct nsproxy *, if (_T) put_nsproxy(_T)) #endif
2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtual NCI device simulation driver * * Copyright (C) 2020 Samsung Electronics * Bongsu Jeon <bongsu.jeon@samsung.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/wait.h> #include <net/nfc/nci_core.h> #define IOCTL_GET_NCIDEV_IDX 0 #define VIRTUAL_NFC_PROTOCOLS (NFC_PROTO_JEWEL_MASK | \ NFC_PROTO_MIFARE_MASK | \ NFC_PROTO_FELICA_MASK | \ NFC_PROTO_ISO14443_MASK | \ NFC_PROTO_ISO14443_B_MASK | \ NFC_PROTO_ISO15693_MASK) struct virtual_nci_dev { struct nci_dev *ndev; struct mutex mtx; struct sk_buff *send_buff; struct wait_queue_head wq; bool running; }; static int virtual_nci_open(struct nci_dev *ndev) { struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); vdev->running = true; return 0; } static int virtual_nci_close(struct nci_dev *ndev) { struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); mutex_lock(&vdev->mtx); kfree_skb(vdev->send_buff); vdev->send_buff = NULL; vdev->running = false; mutex_unlock(&vdev->mtx); return 0; } static int virtual_nci_send(struct nci_dev *ndev, struct sk_buff *skb) { struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); mutex_lock(&vdev->mtx); if (vdev->send_buff || !vdev->running) { mutex_unlock(&vdev->mtx); kfree_skb(skb); return -1; } vdev->send_buff = skb_copy(skb, GFP_KERNEL); if (!vdev->send_buff) { mutex_unlock(&vdev->mtx); kfree_skb(skb); return -1; } mutex_unlock(&vdev->mtx); wake_up_interruptible(&vdev->wq); consume_skb(skb); return 0; } static const struct nci_ops virtual_nci_ops = { .open = virtual_nci_open, .close = virtual_nci_close, .send = virtual_nci_send }; static ssize_t virtual_ncidev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct virtual_nci_dev *vdev = file->private_data; size_t actual_len; mutex_lock(&vdev->mtx); while (!vdev->send_buff) { mutex_unlock(&vdev->mtx); if (wait_event_interruptible(vdev->wq, vdev->send_buff)) return -EFAULT; mutex_lock(&vdev->mtx); } actual_len = min_t(size_t, count, vdev->send_buff->len); if (copy_to_user(buf, vdev->send_buff->data, actual_len)) { mutex_unlock(&vdev->mtx); return -EFAULT; } skb_pull(vdev->send_buff, actual_len); if (vdev->send_buff->len == 0) { consume_skb(vdev->send_buff); vdev->send_buff = NULL; } mutex_unlock(&vdev->mtx); return actual_len; } static ssize_t virtual_ncidev_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct virtual_nci_dev *vdev = file->private_data; struct sk_buff *skb; skb = alloc_skb(count, GFP_KERNEL); if (!skb) return -ENOMEM; if (copy_from_user(skb_put(skb, count), buf, count)) { kfree_skb(skb); return -EFAULT; } if (strnlen(skb->data, count) != count) { kfree_skb(skb); return -EINVAL; } nci_recv_frame(vdev->ndev, skb); return count; } static int virtual_ncidev_open(struct inode *inode, struct file *file) { int ret = 0; struct virtual_nci_dev *vdev; vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) return -ENOMEM; vdev->ndev = nci_allocate_device(&virtual_nci_ops, VIRTUAL_NFC_PROTOCOLS, 0, 0); if (!vdev->ndev) { kfree(vdev); return -ENOMEM; } mutex_init(&vdev->mtx); init_waitqueue_head(&vdev->wq); file->private_data = vdev; nci_set_drvdata(vdev->ndev, vdev); ret = nci_register_device(vdev->ndev); if (ret < 0) { nci_free_device(vdev->ndev); mutex_destroy(&vdev->mtx); kfree(vdev); return ret; } return 0; } static int virtual_ncidev_close(struct inode *inode, struct file *file) { struct virtual_nci_dev *vdev = file->private_data; nci_unregister_device(vdev->ndev); nci_free_device(vdev->ndev); mutex_destroy(&vdev->mtx); kfree(vdev); return 0; } static long virtual_ncidev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct virtual_nci_dev *vdev = file->private_data; const struct nfc_dev *nfc_dev = vdev->ndev->nfc_dev; void __user *p = (void __user *)arg; if (cmd != IOCTL_GET_NCIDEV_IDX) return -ENOTTY; if (copy_to_user(p, &nfc_dev->idx, sizeof(nfc_dev->idx))) return -EFAULT; return 0; } static const struct file_operations virtual_ncidev_fops = { .owner = THIS_MODULE, .read = virtual_ncidev_read, .write = virtual_ncidev_write, .open = virtual_ncidev_open, .release = virtual_ncidev_close, .unlocked_ioctl = virtual_ncidev_ioctl }; static struct miscdevice miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "virtual_nci", .fops = &virtual_ncidev_fops, .mode = 0600, }; module_misc_device(miscdev); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Virtual NCI device simulation driver"); MODULE_AUTHOR("Bongsu Jeon <bongsu.jeon@samsung.com>");
9 9 9 2440 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H /* * Linux wait queue related types and methods */ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> #include <asm/current.h> typedef struct wait_queue_entry wait_queue_entry_t; typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_CUSTOM 0x04 #define WQ_FLAG_DONE 0x08 #define WQ_FLAG_PRIORITY 0x10 /* * A single wait-queue entry structure: */ struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; struct list_head entry; }; struct wait_queue_head { spinlock_t lock; struct list_head head; }; typedef struct wait_queue_head wait_queue_head_t; struct task_struct; /* * Macros for declaration and initialisaton of the datatypes */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ .entry = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = LIST_HEAD_INIT(name.head) } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); #define init_waitqueue_head(wq_head) \ do { \ static struct lock_class_key __key; \ \ __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { wq_entry->flags = 0; wq_entry->private = p; wq_entry->func = default_wake_function; } static inline void init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { wq_entry->flags = 0; wq_entry->private = NULL; wq_entry->func = func; } /** * waitqueue_active -- locklessly test for waiters on the queue * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like:: * * CPU0 - waker CPU1 - waiter * * for (;;) { * @cond = true; prepare_to_wait(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (waitqueue_active(wq_head)) if (@cond) * wake_up(wq_head); break; * schedule(); * } * finish_wait(&wq_head, &wait); * * Because without the explicit smp_mb() it's possible for the * waitqueue_active() load to get hoisted over the @cond store such that we'll * observe an empty wait list while the waiter might not observe @cond. * * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. */ static inline int waitqueue_active(struct wait_queue_head *wq_head) { return !list_empty(&wq_head->head); } /** * wq_has_single_sleeper - check if there is only one sleeper * @wq_head: wait queue head * * Returns true of wq_head has only one sleeper on the list. * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) { return list_is_singular(&wq_head->head); } /** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head * * Returns true if wq_head has waiting processes * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier should be paired with one on the * waiting side. */ smp_mb(); return waitqueue_active(wq_head); } extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { struct list_head *head = &wq_head->head; struct wait_queue_entry *wq; list_for_each_entry(wq, &wq_head->head, entry) { if (!(wq->flags & WQ_FLAG_PRIORITY)) break; head = &wq->entry; } list_add(&wq_entry->entry, head); } /* * Used for wake-one threads: */ static inline void __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq_head, wq_entry); } static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_del(&wq_entry->entry); } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) /* * Wakeup macros to be used to report events to the targets. */ #define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m)) #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) #define wake_up_poll_on_current_cpu(x, m) \ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) /** * wake_up_pollfree - signal that a polled waitqueue is going away * @wq_head: the wait queue head * * In the very rare cases where a ->poll() implementation uses a waitqueue whose * lifetime is tied to a task rather than to the 'struct file' being polled, * this function must be called before the waitqueue is freed so that * non-blocking polls (e.g. epoll) are notified that the queue is going away. * * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. */ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) { /* * For performance reasons, we don't always take the queue lock here. * Therefore, we might race with someone removing the last entry from * the queue, and proceed while they still hold the queue lock. * However, rcu_read_lock() is required to be held in such cases, so we * can safely proceed with an RCU-delayed free. */ if (waitqueue_active(wq_head)) __wake_up_pollfree(wq_head); } #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ if (__cond && !__ret) \ __ret = 1; \ __cond || !__ret; \ }) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. * * This is so that both can use the ___wait_cond_timeout() construct * to wrap the condition. * * The type inconsistency of the wait_event_*() __ret variable is also * on purpose; we use long where we can return timeout values and int * otherwise. */ #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_entry __wq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ \ if (condition) \ break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ }) #define __wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_event(wq_head, condition); \ } while (0) #define __io_wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /* * io_wait_event() -- like wait_event() but with io_schedule() */ #define io_wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __io_wait_event(wq_head, condition); \ } while (0) #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute * to system load) until the @condition evaluates to true. The * @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_freezable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable(wq_head, condition); \ __ret; \ }) #define __wait_event_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid * increasing load and is freezable. */ #define wait_event_freezable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ cmd1; schedule(); cmd2) /* * Just like wait_event_cmd(), except it sets exclusive flag */ #define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) /** * wait_event_cmd - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @cmd1: the command will be executed before sleep * @cmd2: the command will be executed after sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_interruptible(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event_interruptible - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq_head, condition); \ __ret; \ }) #define __wait_event_interruptible_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ ({ \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ __ret = ___wait_event(wq_head, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ __ret; \ }) /** * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, or -ETIME if the timeout * elapsed. */ #define wait_event_hrtimeout(wq_head, condition, timeout) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, -ERESTARTSYS if it was * interrupted by a signal, or -ETIME if the timeout elapsed. */ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define __wait_event_interruptible_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_killable_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ schedule()) #define wait_event_killable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_freezable_exclusive(wq, condition) \ ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable_exclusive(wq, condition); \ __ret; \ }) /** * wait_event_idle - wait for a condition without contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \ } while (0) /** * wait_event_idle_exclusive - wait for a condition with contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle_exclusive(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \ } while (0) #define __wait_event_idle_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 1, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\ __ret; \ }) extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ __ret = fn(&(wq), &__wait); \ if (__ret) \ break; \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ __ret; \ }) /** * wait_event_interruptible_locked - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_killable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq_head, condition); \ __ret; \ }) #define __wait_event_state(wq, condition, state) \ ___wait_event(wq, condition, state, 0, 0, schedule()) /** * wait_event_state - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @state: state to sleep in * * The process is put to sleep (@state) until the @condition evaluates to true * or a signal is received (when allowed by @state). The @condition is checked * each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a signal * (when allowed by @state) and 0 if @condition evaluated to true. */ #define wait_event_state(wq_head, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state(wq_head, condition, state); \ __ret; \ }) #define __wait_event_state_exclusive(wq, condition, state) \ ___wait_event(wq, condition, state, 1, 0, schedule()) #define wait_event_state_exclusive(wq, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state_exclusive(wq, condition, state); \ __ret; \ }) #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a kill signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a kill signal. * * Only kill signals interrupt this process. */ #define wait_event_killable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_killable_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd * and schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. */ #define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, cmd); \ } while (0) /** * wait_event_lock_irq - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. */ #define wait_event_lock_irq(wq_head, condition, lock) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, ); \ } while (0) #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. * The condition is checked under the lock. This is expected to * be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd and * schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock, cmd); \ __ret; \ }) /** * wait_event_interruptible_lock_irq - sleep until a condition gets true. * The condition is checked under the lock. This is expected * to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq(wq_head, condition, lock) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock,); \ __ret; \ }) #define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ state, 0, timeout, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets * true or a timeout elapses. The condition is checked under * the lock. This is expected to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. */ #define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \ timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ struct wait_queue_entry name = { \ .private = current, \ .func = function, \ .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define init_wait_func(wait, function) \ do { \ (wait)->private = current; \ (wait)->func = function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) #define init_wait(wait) init_wait_func(wait, autoremove_wake_function) typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */
9 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Network filesystem support services. * * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See: * * Documentation/filesystems/netfs_library.rst * * for a description of the network filesystem interface declared here. */ #ifndef _LINUX_NETFS_H #define _LINUX_NETFS_H #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/uio.h> #include <linux/rolling_buffer.h> enum netfs_sreq_ref_trace; typedef struct mempool mempool_t; struct folio_queue; /** * folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED] * @folio: The folio. * * Call this function before writing a folio to a local cache. Starting a * second write before the first one finishes is not allowed. * * Note that this should no longer be used. */ static inline void folio_start_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio); folio_get(folio); folio_set_private_2(folio); } enum netfs_io_source { NETFS_SOURCE_UNKNOWN, NETFS_FILL_WITH_ZEROES, NETFS_DOWNLOAD_FROM_SERVER, NETFS_READ_FROM_CACHE, NETFS_INVALID_READ, NETFS_UPLOAD_TO_SERVER, NETFS_WRITE_TO_CACHE, } __mode(byte); typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error); /* * Per-inode context. This wraps the VFS inode. */ struct netfs_inode { struct inode inode; /* The VFS inode */ const struct netfs_request_ops *ops; #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cache; #endif struct mutex wb_lock; /* Writeback serialisation */ loff_t remote_i_size; /* Size of the remote file */ loff_t zero_point; /* Size after which we assume there's no data * on the server */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ #define NETFS_ICTX_SINGLE_NO_UPLOAD 4 /* Monolithic payload, cache but no upload */ }; /* * A netfs group - for instance a ceph snap. This is marked on dirty pages and * pages marked with a group must be flushed before they can be written under * the domain of another group. */ struct netfs_group { refcount_t ref; void (*free)(struct netfs_group *netfs_group); }; /* * Information about a dirty page (attached only if necessary). * folio->private */ struct netfs_folio { struct netfs_group *netfs_group; /* Filesystem's grouping marker (or NULL). */ unsigned int dirty_offset; /* Write-streaming dirty data offset */ unsigned int dirty_len; /* Write-streaming dirty data length */ }; #define NETFS_FOLIO_INFO 0x1UL /* OR'd with folio->private. */ #define NETFS_FOLIO_COPY_TO_CACHE ((struct netfs_group *)0x356UL) /* Write to the cache only */ static inline bool netfs_is_folio_info(const void *priv) { return (unsigned long)priv & NETFS_FOLIO_INFO; } static inline struct netfs_folio *__netfs_folio_info(const void *priv) { if (netfs_is_folio_info(priv)) return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO); return NULL; } static inline struct netfs_folio *netfs_folio_info(struct folio *folio) { return __netfs_folio_info(folio_get_private(folio)); } static inline struct netfs_group *netfs_folio_group(struct folio *folio) { struct netfs_folio *finfo; void *priv = folio_get_private(folio); finfo = netfs_folio_info(folio); if (finfo) return finfo->netfs_group; return priv; } /* * Stream of I/O subrequests going to a particular destination, such as the * server or the local cache. This is mainly intended for writing where we may * have to write to multiple destinations concurrently. */ struct netfs_io_stream { /* Submission tracking */ struct netfs_io_subrequest *construct; /* Op being constructed */ size_t sreq_max_len; /* Maximum size of a subrequest */ unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */ unsigned int submit_off; /* Folio offset we're submitting from */ unsigned int submit_len; /* Amount of data left to submit */ unsigned int submit_extendable_to; /* Amount I/O can be rounded up to */ void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_subrequest *front; /* Op being collected */ unsigned long long collected_to; /* Position we've collected results to */ size_t transferred; /* The amount transferred from this stream */ unsigned short error; /* Aggregate error for the stream */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* Index of stream in parent table */ bool avail; /* T if stream is available */ bool active; /* T if stream is active */ bool need_retry; /* T if this stream needs retrying */ bool failed; /* T if this stream failed */ bool transferred_valid; /* T is ->transferred is valid */ }; /* * Resources required to do operations on a cache. */ struct netfs_cache_resources { const struct netfs_cache_ops *ops; void *cache_priv; void *cache_priv2; unsigned int debug_id; /* Cookie debug ID */ unsigned int inval_counter; /* object->inval_counter at begin_op */ }; /* * Descriptor for a single component subrequest. Each operation represents an * individual read/write from/to a server, a cache, a journal, etc.. * * The buffer iterator is persistent for the life of the subrequest struct and * the pages it points to can be relied on to exist for the duration. */ struct netfs_io_subrequest { struct netfs_io_request *rreq; /* Supervising I/O request */ struct work_struct work; struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ unsigned long long start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ #define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ NETFS_READ_SINGLE, /* This read should be treated as a single object */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_UNBUFFERED_READ, /* This is an unbuffered read */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITEBACK_SINGLE, /* This monolithic write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_WRITE, /* This is a direct I/O write */ NETFS_PGPRIV2_COPY_TO_CACHE, /* [DEPRECATED] This is writing read data to the cache */ nr__netfs_io_origin } __mode(byte); /* * Descriptor for an I/O helper request. This is used to make multiple I/O * operations to a variety of data stores and then stitch the result together. */ struct netfs_io_request { union { struct work_struct cleanup_work; /* Deferred cleanup work */ struct rcu_head rcu; }; struct work_struct work; /* Result collector work */ struct inode *inode; /* The file being accessed */ struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; struct netfs_io_request *copy_to_cache; /* Request to write just-read data to the cache */ #ifdef CONFIG_PROC_FS struct list_head proc_link; /* Link in netfs_iorequests */ #endif struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ struct rolling_buffer buffer; /* Unencrypted buffer */ #define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1 #define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2 wait_queue_head_t waitq; /* Processor waiter */ void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ long error; /* 0 or error that occurred */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long abandon_to; /* Position to abandon folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ unsigned int wsize; /* Maximum write size (0 for none) */ atomic_t subreq_counter; /* Next subreq->debug_index */ unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ unsigned char front_folio_order; /* Order (size) of front folio */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ refcount_t ref; unsigned long flags; #define NETFS_RREQ_IN_PROGRESS 0 /* Unlocked when the request completes (has ref) */ #define NETFS_RREQ_ALL_QUEUED 1 /* All subreqs are now queued */ #define NETFS_RREQ_PAUSE 2 /* Pause subrequest generation */ #define NETFS_RREQ_FAILED 3 /* The request failed */ #define NETFS_RREQ_RETRYING 4 /* Set if we're in the retry path */ #define NETFS_RREQ_SHORT_TRANSFER 5 /* Set if we have a short transfer */ #define NETFS_RREQ_OFFLOAD_COLLECTION 8 /* Offload collection to workqueue */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 9 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_FOLIO_COPY_TO_CACHE 10 /* Copy current folio to cache from read */ #define NETFS_RREQ_UPLOAD_TO_SERVER 11 /* Need to write to the server */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; }; /* * Operations the network filesystem can/must provide to the helpers. */ struct netfs_request_ops { mempool_t *request_pool; mempool_t *subrequest_pool; int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); void (*free_subrequest)(struct netfs_io_subrequest *rreq); /* Read request handling */ void (*expand_readahead)(struct netfs_io_request *rreq); int (*prepare_read)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); /* Modification handling */ void (*update_i_size)(struct inode *inode, loff_t i_size); void (*post_modify)(struct inode *inode); /* Write request handling */ void (*begin_writeback)(struct netfs_io_request *wreq); void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); void (*retry_request)(struct netfs_io_request *wreq, struct netfs_io_stream *stream); void (*invalidate_cache)(struct netfs_io_request *wreq); }; /* * How to handle reading from a hole. */ enum netfs_read_from_hole { NETFS_READ_HOLE_IGNORE, NETFS_READ_HOLE_FAIL, }; /* * Table of operations for access to a cache. */ struct netfs_cache_ops { /* End an operation */ void (*end_operation)(struct netfs_cache_resources *cres); /* Read data from the cache */ int (*read)(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, enum netfs_read_from_hole read_hole, netfs_io_terminated_t term_func, void *term_func_priv); /* Write data to the cache */ int (*write)(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, netfs_io_terminated_t term_func, void *term_func_priv); /* Write data to the cache from a netfs subrequest. */ void (*issue_write)(struct netfs_io_subrequest *subreq); /* Expand readahead request */ void (*expand_readahead)(struct netfs_cache_resources *cres, unsigned long long *_start, unsigned long long *_len, unsigned long long i_size); /* Prepare a read operation, shortening it to a cached/uncached * boundary as appropriate. */ enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq, unsigned long long i_size); /* Prepare a write subrequest, working out if we're allowed to do it * and finding out the maximum amount of data to gather before * attempting to submit. If we're not permitted to do it, the * subrequest should be marked failed. */ void (*prepare_write_subreq)(struct netfs_io_subrequest *subreq); /* Prepare a write operation, working out what part of the write we can * actually do. */ int (*prepare_write)(struct netfs_cache_resources *cres, loff_t *_start, size_t *_len, size_t upper_len, loff_t i_size, bool no_space_allocated_yet); /* Prepare an on-demand read operation, shortening it to a cached/uncached * boundary as appropriate. */ enum netfs_io_source (*prepare_ondemand_read)(struct netfs_cache_resources *cres, loff_t start, size_t *_len, loff_t i_size, unsigned long *_flags, ino_t ino); /* Query the occupancy of the cache in a region, returning where the * next chunk of data starts and how long it is. */ int (*query_occupancy)(struct netfs_cache_resources *cres, loff_t start, size_t len, size_t granularity, loff_t *_data_start, size_t *_data_len); }; /* High-level read API. */ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, struct netfs_group *netfs_group); ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from); ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); /* Single, monolithic object read/write API. */ void netfs_single_mark_inode_dirty(struct inode *inode); ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter); int netfs_writeback_single(struct address_space *mapping, struct writeback_control *wbc, struct iov_iter *iter); /* Address operations API */ struct readahead_control; void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, struct address_space *, loff_t pos, unsigned int len, struct folio **, void **fsdata); int netfs_writepages(struct address_space *mapping, struct writeback_control *wbc); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool netfs_release_folio(struct folio *folio, gfp_t gfp); /* VMA operations API. */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq); void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode); int netfs_start_io_write(struct inode *inode); void netfs_end_io_write(struct inode *inode); int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); /* Miscellaneous APIs. */ struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, unsigned int trace /*enum netfs_folioq_trace*/); void netfs_folioq_free(struct folio_queue *folioq, unsigned int trace /*enum netfs_trace_folioq*/); /* Buffer wrangling helpers API. */ int netfs_alloc_folioq_buffer(struct address_space *mapping, struct folio_queue **_buffer, size_t *_cur_size, ssize_t size, gfp_t gfp); void netfs_free_folioq_buffer(struct folio_queue *fq); /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query * * Get the netfs lib inode context from the network filesystem's inode. The * context struct is expected to directly follow on from the VFS inode struct. */ static inline struct netfs_inode *netfs_inode(struct inode *inode) { return container_of(inode, struct netfs_inode, inode); } /** * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise * @ops: The netfs's operations list * @use_zero_point: True to use the zero_point read optimisation * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ static inline void netfs_inode_init(struct netfs_inode *ctx, const struct netfs_request_ops *ops, bool use_zero_point) { ctx->ops = ops; ctx->remote_i_size = i_size_read(&ctx->inode); ctx->zero_point = LLONG_MAX; ctx->flags = 0; atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { ctx->zero_point = ctx->remote_i_size; mapping_set_release_always(ctx->inode.i_mapping); } } /** * netfs_resize_file - Note that a file got resized * @ctx: The netfs inode being resized * @new_i_size: The new file size * @changed_on_server: The change was applied to the server * * Inform the netfs lib that a file got resized so that it can adjust its state. */ static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, bool changed_on_server) { if (changed_on_server) ctx->remote_i_size = new_i_size; if (new_i_size < ctx->zero_point) ctx->zero_point = new_i_size; } /** * netfs_i_cookie - Get the cache cookie from the inode * @ctx: The netfs inode to query * * Get the caching cookie (if enabled) from the network filesystem's inode. */ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) { #if IS_ENABLED(CONFIG_FSCACHE) return ctx->cache; #else return NULL; #endif } /** * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete * @inode: The netfs inode to wait on * * Wait for outstanding I/O requests of any type to complete. This is intended * to be called from inode eviction routines. This makes sure that any * resources held by those requests are cleaned up before we let the inode get * cleaned up. */ static inline void netfs_wait_for_outstanding_io(struct inode *inode) { struct netfs_inode *ictx = netfs_inode(inode); wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0); } #endif /* _LINUX_NETFS_H */
40 22 22 22 1 43 47 47 47 38 16 38 47 10 44 44 44 43 44 4 6 4 41 15 43 4 5 2 22 2 22 5 34 33 12 23 12 12 22 4 22 22 1 21 22 22 15 2 17 39 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2015, Google, Inc. * * This was originally taken from fs/mpage.c * * The ext4_mpage_readpages() function here is intended to * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. * * This will allow us to attach a callback function to support ext4 * encryption. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "ext4.h" #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static mempool_t *bio_post_read_ctx_pool; /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX, }; struct bio_post_read_ctx { struct bio *bio; struct work_struct work; unsigned int cur_step; unsigned int enabled_steps; }; static void __read_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if (fscrypt_decrypt_bio(bio)) bio_post_read_processing(ctx); else __read_end_io(bio); } static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; /* * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. * This is safe because verity is the last post-read step. */ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; fsverity_verify_bio(bio); __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { /* * We use different work queues for decryption and for verity because * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { INIT_WORK(&ctx->work, decrypt_work); fscrypt_enqueue_decrypt_work(&ctx->work); return; } ctx->cur_step++; fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); fsverity_enqueue_verify_work(&ctx->work); return; } ctx->cur_step++; fallthrough; default: __read_end_io(ctx->bio); } } static bool bio_post_read_required(struct bio *bio) { return bio->bi_private && !bio->bi_status; } /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_end_io(struct bio *bio) { if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } __read_end_io(bio); } static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void ext4_set_bio_post_read_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx) { unsigned int post_read_steps = 0; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { /* Due to the mempool, this never fails. */ struct bio_post_read_ctx *ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } } static inline loff_t ext4_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t first_block; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio); for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; unsigned int first_hole; unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; blocks_per_folio = folio_size(folio) >> blkbits; first_hole = blocks_per_folio; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the previous result first. */ if ((map.m_flags & EXT4_MAP_MAPPED) && block_in_file > map.m_lblk && block_in_file < (map.m_lblk + map.m_len)) { unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; first_block = map.m_pblk + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } /* * Then do more ext4_map_blocks() calls until we are * done with this folio. */ while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); goto next_page; } } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (!page_block) first_block = map.m_pblk; else if (first_block + page_block != map.m_pblk) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; folio_end_read(folio, true); continue; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; } if (bio == NULL) { /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) block_read_full_folio(folio, ext4_get_block); else folio_unlock(folio); next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); return 0; } int __init ext4_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void ext4_exit_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); }
369 332 312 3 2 3 3 47 43 290 289 290 290 290 290 289 290 26 12 351 351 230 220 23 23 101 100 5 96 8 100 5 101 101 3 3 218 212 10 218 218 218 218 113 132 42 11 53 53 53 53 33 2 35 35 35 19 19 19 4 15 15 15 6 6 6 6 6 6 23 23 13 227 203 202 82 4 56 29 78 7 6 3 8 8 28 7 1 4 21 26 8 2 1 1 6 1 1 8 8 40 20 2 16 5 27 10 10 19 2 13 3 3 32 5 10 26 27 10 37 37 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_health.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_parent.h" #include "xfs_ag.h" #include "xfs_ialloc.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", .len = 2, .type = XFS_DIR3_FT_DIR, }; const struct xfs_name xfs_name_dot = { .name = (const unsigned char *)".", .len = 1, .type = XFS_DIR3_FT_DIR, }; /* * Convert inode mode to directory entry filetype */ unsigned char xfs_mode_to_ftype( int mode) { switch (mode & S_IFMT) { case S_IFREG: return XFS_DIR3_FT_REG_FILE; case S_IFDIR: return XFS_DIR3_FT_DIR; case S_IFCHR: return XFS_DIR3_FT_CHRDEV; case S_IFBLK: return XFS_DIR3_FT_BLKDEV; case S_IFIFO: return XFS_DIR3_FT_FIFO; case S_IFSOCK: return XFS_DIR3_FT_SOCK; case S_IFLNK: return XFS_DIR3_FT_SYMLINK; default: return XFS_DIR3_FT_UNKNOWN; } } /* * ASCII case-insensitive (ie. A-Z) support for directories that was * used in IRIX. */ xfs_dahash_t xfs_ascii_ci_hashname( const struct xfs_name *name) { xfs_dahash_t hash; int i; for (i = 0, hash = 0; i < name->len; i++) hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7); return hash; } enum xfs_dacmp xfs_ascii_ci_compname( struct xfs_da_args *args, const unsigned char *name, int len) { enum xfs_dacmp result; int i; if (args->namelen != len) return XFS_CMP_DIFFERENT; result = XFS_CMP_EXACT; for (i = 0; i < len; i++) { if (args->name[i] == name[i]) continue; if (xfs_ascii_ci_xfrm(args->name[i]) != xfs_ascii_ci_xfrm(name[i])) return XFS_CMP_DIFFERENT; result = XFS_CMP_CASE; } return result; } int xfs_da_mount( struct xfs_mount *mp) { struct xfs_da_geometry *dageo; ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry), GFP_KERNEL | __GFP_RETRY_MAYFAIL); mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { kfree(mp->m_dir_geo); kfree(mp->m_attr_geo); return -ENOMEM; } /* set up directory geometry */ dageo = mp->m_dir_geo; dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; if (xfs_has_crc(mp)) { dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); dageo->data_entry_offset = sizeof(struct xfs_dir3_data_hdr); } else { dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr); dageo->data_entry_offset = sizeof(struct xfs_dir2_data_hdr); } dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) / sizeof(struct xfs_dir2_leaf_entry); dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) / sizeof(xfs_dir2_data_off_t); dageo->data_first_offset = dageo->data_entry_offset + xfs_dir2_data_entsize(mp, 1) + xfs_dir2_data_entsize(mp, 2); /* * Now we've set up the block conversion variables, we can calculate the * segment block constants using the geometry structure. */ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ dageo = mp->m_attr_geo; dageo->blklog = mp->m_sb.sb_blocklog; dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = 1 << dageo->blklog; dageo->fsbcount = 1; dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); if (xfs_has_large_extent_counts(mp)) dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; else dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } void xfs_da_unmount( struct xfs_mount *mp) { kfree(mp->m_dir_geo); kfree(mp->m_attr_geo); } /* * Return 1 if directory contains only "." and "..". */ static bool xfs_dir_isempty( xfs_inode_t *dp) { xfs_dir2_sf_hdr_t *sfp; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ return true; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return false; sfp = dp->i_df.if_data; return !sfp->count; } /* * Validate a given inode number. */ int xfs_dir_ino_validate( xfs_mount_t *mp, xfs_ino_t ino) { bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; } return 0; } /* * Initialize a directory with its "." and ".." entries. */ int xfs_dir_init( xfs_trans_t *tp, xfs_inode_t *dp, xfs_inode_t *pdp) { struct xfs_da_args *args; int error; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); if (error) return error; args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; args->owner = dp->i_ino; error = xfs_dir2_sf_create(args, pdp->i_ino); kfree(args); return error; } enum xfs_dir2_fmt xfs_dir2_format( struct xfs_da_args *args, int *error) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; xfs_fileoff_t eof; xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); *error = 0; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return XFS_DIR2_FMT_SF; *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK); if (*error) return XFS_DIR2_FMT_ERROR; if (eof == XFS_B_TO_FSB(mp, geo->blksize)) { if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) { xfs_da_mark_sick(args); *error = -EFSCORRUPTED; return XFS_DIR2_FMT_ERROR; } return XFS_DIR2_FMT_BLOCK; } if (eof == geo->leafblk + geo->fsbcount) return XFS_DIR2_FMT_LEAF; return XFS_DIR2_FMT_NODE; } int xfs_dir_createname_args( struct xfs_da_args *args) { int error; if (!args->inumber) args->op_flags |= XFS_DA_OP_JUSTCHECK; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: return xfs_dir2_sf_addname(args); case XFS_DIR2_FMT_BLOCK: return xfs_dir2_block_addname(args); case XFS_DIR2_FMT_LEAF: return xfs_dir2_leaf_addname(args); case XFS_DIR2_FMT_NODE: return xfs_dir2_node_addname(args); default: return error; } } /* * Enter a name in a directory, or check for available space. * If inum is 0, only the available space test is performed. */ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (inum) { rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; XFS_STATS_INC(dp->i_mount, xs_dir_create); } args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args->owner = dp->i_ino; rval = xfs_dir_createname_args(args); kfree(args); return rval; } /* * If doing a CI lookup and case-insensitive match, dup actual name into * args.value. Return EEXIST for success (ie. name found) or an error. */ int xfs_dir_cilookup_result( struct xfs_da_args *args, const unsigned char *name, int len) { if (args->cmpresult == XFS_CMP_DIFFERENT) return -ENOENT; if (args->cmpresult != XFS_CMP_CASE || !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; args->value = kmemdup(name, len, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; args->valuelen = len; return -EEXIST; } int xfs_dir_lookup_args( struct xfs_da_args *args) { int error; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: error = xfs_dir2_sf_lookup(args); break; case XFS_DIR2_FMT_BLOCK: error = xfs_dir2_block_lookup(args); break; case XFS_DIR2_FMT_LEAF: error = xfs_dir2_leaf_lookup(args); break; case XFS_DIR2_FMT_NODE: error = xfs_dir2_node_lookup(args); break; default: break; } if (error != -EEXIST) return error; return 0; } /* * Lookup a name in a directory, give back the inode number. * If ci_name is not NULL, returns the actual name in ci_name if it differs * to name, or ci_name->name is set to NULL for an exact match. */ int xfs_dir_lookup( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *inum, /* out: inode number */ struct xfs_name *ci_name) /* out: actual name if CI match */ { struct xfs_da_args *args; int rval; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->dp = dp; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_OKNOENT; args->owner = dp->i_ino; if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); rval = xfs_dir_lookup_args(args); if (!rval) { *inum = args->inumber; if (ci_name) { ci_name->name = args->value; ci_name->len = args->valuelen; } } xfs_iunlock(dp, lock_mode); kfree(args); return rval; } int xfs_dir_removename_args( struct xfs_da_args *args) { int error; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: return xfs_dir2_sf_removename(args); case XFS_DIR2_FMT_BLOCK: return xfs_dir2_block_removename(args); case XFS_DIR2_FMT_LEAF: return xfs_dir2_leaf_removename(args); case XFS_DIR2_FMT_NODE: return xfs_dir2_node_removename(args); default: return error; } } /* * Remove an entry from a directory. */ int xfs_dir_removename( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = ino; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->owner = dp->i_ino; rval = xfs_dir_removename_args(args); kfree(args); return rval; } int xfs_dir_replace_args( struct xfs_da_args *args) { int error; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: return xfs_dir2_sf_replace(args); case XFS_DIR2_FMT_BLOCK: return xfs_dir2_block_replace(args); case XFS_DIR2_FMT_LEAF: return xfs_dir2_leaf_replace(args); case XFS_DIR2_FMT_NODE: return xfs_dir2_node_replace(args); default: return error; } } /* * Replace the inode number of a directory entry. */ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->owner = dp->i_ino; rval = xfs_dir_replace_args(args); kfree(args); return rval; } /* * See if this entry can be added to the directory without allocating space. */ int xfs_dir_canenter( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name) /* name of entry to add */ { return xfs_dir_createname(tp, dp, name, 0, 0); } /* * Utility routines. */ /* * Add a block to the directory. * * This routine is for data and free blocks, not leaf/node blocks which are * handled by xfs_da_grow_inode. */ int xfs_dir2_grow_inode( struct xfs_da_args *args, int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ xfs_dir2_db_t *dbp) /* out: block number added */ { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; xfs_fileoff_t bno; /* directory offset of new block */ int count; /* count of filesystem blocks */ int error; trace_xfs_dir2_grow_inode(args, space); /* * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); count = args->geo->fsbcount; error = xfs_da_grow_inode_int(args, &bno, count); if (error) return error; *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. */ if (space == XFS_DIR2_DATA_SPACE) { xfs_fsize_t size; /* directory file (data) size */ size = XFS_FSB_TO_B(mp, bno + count); if (size > dp->i_disk_size) { dp->i_disk_size = size; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); } } return 0; } /* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. */ int xfs_dir2_shrink_inode( struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp) { xfs_fileoff_t bno; /* directory file offset */ xfs_dablk_t da; /* directory file offset */ int done; /* bunmap is finished */ struct xfs_inode *dp; int error; struct xfs_mount *mp; struct xfs_trans *tp; trace_xfs_dir2_shrink_inode(args, db); dp = args->dp; mp = dp->i_mount; tp = args->trans; da = xfs_dir2_db_to_da(args->geo, db); /* Unmap the fsblock(s). */ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, &done); if (error) { /* * ENOSPC actually can happen if we're in a removename with no * space reservation, and the resulting block removal would * cause a bmap btree split or conversion from extents to btree. * This can only happen for un-fragmented directory blocks, * since you need to be punching out the middle of an extent. * In this case we need to leave the block in the file, and not * binval it. So the block has to be in a consistent empty * state and appropriately logged. We don't free up the buffer, * the caller can tell it hasn't happened since it got an error * back. */ return error; } ASSERT(done); /* * Invalidate the buffer from the transaction. */ xfs_trans_binval(tp, bp); /* * If it's not a data block, we're done. */ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) return 0; /* * If the block isn't the last one in the directory, we're done. */ if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { /* * This can't really happen unless there's kernel corruption. */ return error; } if (db == args->geo->datablk) ASSERT(bno == 0); else ASSERT(bno > 0); /* * Set the size to the new last block. */ dp->i_disk_size = XFS_FSB_TO_B(mp, bno); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } /* Returns true if the directory entry name is valid. */ bool xfs_dir2_namecheck( const void *name, size_t length) { /* * MAXNAMELEN includes the trailing null, but (name/length) leave it * out, so use >= for the length check. */ if (length >= MAXNAMELEN) return false; /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, const struct xfs_name *name) { if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); return xfs_da_hashname(name->name, name->len); } enum xfs_dacmp xfs_dir2_compname( struct xfs_da_args *args, const unsigned char *name, int len) { if (unlikely(xfs_has_asciici(args->dp->i_mount))) return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } #ifdef CONFIG_XFS_LIVE_HOOKS /* * Use a static key here to reduce the overhead of directory live update hooks. * If the compiler supports jump labels, the static branch will be replaced by * a nop sled when there are no hook users. Online fsck is currently the only * caller, so this is a reasonable tradeoff. * * Note: Patching the kernel code requires taking the cpu hotplug lock. Other * parts of the kernel allocate memory with that lock held, which means that * XFS callers cannot hold any locks that might be used by memory reclaim or * writeback when calling the static_branch_{inc,dec} functions. */ DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); void xfs_dir_hook_disable(void) { xfs_hooks_switch_off(&xfs_dir_hooks_switch); } void xfs_dir_hook_enable(void) { xfs_hooks_switch_on(&xfs_dir_hooks_switch); } /* Call hooks for a directory update relating to a child dirent update. */ inline void xfs_dir_update_hook( struct xfs_inode *dp, struct xfs_inode *ip, int delta, const struct xfs_name *name) { if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { struct xfs_dir_update_params p = { .dp = dp, .ip = ip, .delta = delta, .name = name, }; struct xfs_mount *mp = ip->i_mount; xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); } } /* Call the specified function during a directory update. */ int xfs_dir_hook_add( struct xfs_mount *mp, struct xfs_dir_hook *hook) { return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); } /* Stop calling the specified function during a directory update. */ void xfs_dir_hook_del( struct xfs_mount *mp, struct xfs_dir_hook *hook) { xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); } /* Configure directory update hook functions. */ void xfs_dir_hook_setup( struct xfs_dir_hook *hook, notifier_fn_t mod_fn) { xfs_hook_setup(&hook->dirent_hook, mod_fn); } #endif /* CONFIG_XFS_LIVE_HOOKS */ /* * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip * into @dp under the given @name. If @ip is a directory, it will be * initialized. Both inodes must have the ILOCK held and the transaction must * have sufficient blocks reserved. */ int xfs_dir_create_child( struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du) { struct xfs_inode *dp = du->dp; const struct xfs_name *name = du->name; struct xfs_inode *ip = du->ip; int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); if (error) { ASSERT(error != -ENOSPC); return error; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); if (S_ISDIR(VFS_I(ip)->i_mode)) { error = xfs_dir_init(tp, ip, dp); if (error) return error; xfs_bumplink(tp, dp); } /* * If we have parent pointers, we need to add the attribute containing * the parent information now. */ if (du->ppargs) { error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); if (error) return error; } xfs_dir_update_hook(dp, ip, 1, name); return 0; } /* * Given a directory @dp, an existing non-directory inode @ip, and a @name, * link @ip into @dp under the given @name. Both inodes must have the ILOCK * held. */ int xfs_dir_add_child( struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du) { struct xfs_inode *dp = du->dp; const struct xfs_name *name = du->name; struct xfs_inode *ip = du->ip; struct xfs_mount *mp = tp->t_mountp; int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); ASSERT(!S_ISDIR(VFS_I(ip)->i_mode)); if (!resblks) { error = xfs_dir_canenter(tp, dp, name); if (error) return error; } /* * Handle initial link state of O_TMPFILE inode */ if (VFS_I(ip)->i_nlink == 0) { struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); error = xfs_iunlink_remove(tp, pag, ip); xfs_perag_put(pag); if (error) return error; } error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); if (error) return error; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); xfs_bumplink(tp, ip); /* * If we have parent pointers, we now need to add the parent record to * the attribute fork of the inode. If this is the initial parent * attribute, we need to create it correctly, otherwise we can just add * the parent to the inode. */ if (du->ppargs) { error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); if (error) return error; } xfs_dir_update_hook(dp, ip, 1, name); return 0; } /* * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip) * entry from the directory. Both inodes must have the ILOCK held. */ int xfs_dir_remove_child( struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du) { struct xfs_inode *dp = du->dp; const struct xfs_name *name = du->name; struct xfs_inode *ip = du->ip; int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. */ if (S_ISDIR(VFS_I(ip)->i_mode)) { ASSERT(VFS_I(ip)->i_nlink >= 2); if (VFS_I(ip)->i_nlink != 2) return -ENOTEMPTY; if (!xfs_dir_isempty(ip)) return -ENOTEMPTY; /* Drop the link from ip's "..". */ error = xfs_droplink(tp, dp); if (error) return error; /* Drop the "." link from ip to self. */ error = xfs_droplink(tp, ip); if (error) return error; /* * Point the unlinked child directory's ".." entry to the root * directory to eliminate back-references to inodes that may * get freed before the child directory is closed. If the fs * gets shrunk, this can lead to dirent inode validation errors. */ if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, tp->t_mountp->m_sb.sb_rootino, 0); if (error) return error; } } else { /* * When removing a non-directory we need to log the parent * inode here. For a directory this is done implicitly * by the xfs_droplink call for the ".." entry. */ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* Drop the link from dp to ip. */ error = xfs_droplink(tp, ip); if (error) return error; error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); if (error) { ASSERT(error != -ENOENT); return error; } /* Remove parent pointer. */ if (du->ppargs) { error = xfs_parent_removename(tp, du->ppargs, dp, name, ip); if (error) return error; } xfs_dir_update_hook(dp, ip, -1, name); return 0; } /* * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2, * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed. * @ip1 and @ip2 need not be of the same type. * * All inodes must have the ILOCK held, and both entries must already exist. */ int xfs_dir_exchange_children( struct xfs_trans *tp, struct xfs_dir_update *du1, struct xfs_dir_update *du2, unsigned int spaceres) { struct xfs_inode *dp1 = du1->dp; const struct xfs_name *name1 = du1->name; struct xfs_inode *ip1 = du1->ip; struct xfs_inode *dp2 = du2->dp; const struct xfs_name *name2 = du2->name; struct xfs_inode *ip2 = du2->ip; int ip1_flags = 0; int ip2_flags = 0; int dp2_flags = 0; int error; /* Swap inode number for dirent in first parent */ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); if (error) return error; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); if (error) return error; /* * If we're renaming one or more directories across different parents, * update the respective ".." entries (and link counts) to match the new * parents. */ if (dp1 != dp2) { dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; if (S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, dp1->i_ino, spaceres); if (error) return error; /* transfer ip2 ".." reference to dp1 */ if (!S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_droplink(tp, dp2); if (error) return error; xfs_bumplink(tp, dp1); } /* * Although ip1 isn't changed here, userspace needs * to be warned about the change, so that applications * relying on it (like backup ones), will properly * notify the change */ ip1_flags |= XFS_ICHGTIME_CHG; ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; } if (S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, dp2->i_ino, spaceres); if (error) return error; /* transfer ip1 ".." reference to dp2 */ if (!S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_droplink(tp, dp1); if (error) return error; xfs_bumplink(tp, dp2); } /* * Although ip2 isn't changed here, userspace needs * to be warned about the change, so that applications * relying on it (like backup ones), will properly * notify the change */ ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; ip2_flags |= XFS_ICHGTIME_CHG; } } if (ip1_flags) { xfs_trans_ichgtime(tp, ip1, ip1_flags); xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); } if (ip2_flags) { xfs_trans_ichgtime(tp, ip2, ip2_flags); xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); } if (dp2_flags) { xfs_trans_ichgtime(tp, dp2, dp2_flags); xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); /* Schedule parent pointer replacements */ if (du1->ppargs) { error = xfs_parent_replacename(tp, du1->ppargs, dp1, name1, dp2, name2, ip1); if (error) return error; } if (du2->ppargs) { error = xfs_parent_replacename(tp, du2->ppargs, dp2, name2, dp1, name1, ip2); if (error) return error; } /* * Inform our hook clients that we've finished an exchange operation as * follows: removed the source and target files from their directories; * added the target to the source directory; and added the source to * the target directory. All inodes are locked, so it's ok to model a * rename this way so long as we say we deleted entries before we add * new ones. */ xfs_dir_update_hook(dp1, ip1, -1, name1); xfs_dir_update_hook(dp2, ip2, -1, name2); xfs_dir_update_hook(dp1, ip2, 1, name1); xfs_dir_update_hook(dp2, ip1, 1, name2); return 0; } /* * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry * @target_name in directory @target_dp point to @src_ip and remove the * original entry, cleaning up everything left behind. * * Cleanup involves dropping a link count on @target_ip, and either removing * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry * with (@src_name, @wip) if a whiteout inode @wip is supplied. * * All inodes must have the ILOCK held. We assume that if @src_ip is a * directory then its '..' doesn't already point to @target_dp, and that @wip * is a freshly allocated whiteout. */ int xfs_dir_rename_children( struct xfs_trans *tp, struct xfs_dir_update *du_src, struct xfs_dir_update *du_tgt, unsigned int spaceres, struct xfs_dir_update *du_wip) { struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *src_dp = du_src->dp; const struct xfs_name *src_name = du_src->name; struct xfs_inode *src_ip = du_src->ip; struct xfs_inode *target_dp = du_tgt->dp; const struct xfs_name *target_name = du_tgt->name; struct xfs_inode *target_ip = du_tgt->ip; bool new_parent = (src_dp != target_dp); bool src_is_directory; int error; src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. */ if (target_ip == NULL) { /* * If there's no space reservation, check the entry will * fit before actually inserting it. */ if (!spaceres) { error = xfs_dir_canenter(tp, target_dp, target_name); if (error) return error; } } else { /* * If target exists and it's a directory, check that whether * it can be destroyed. */ if (S_ISDIR(VFS_I(target_ip)->i_mode) && (!xfs_dir_isempty(target_ip) || (VFS_I(target_ip)->i_nlink > 2))) return -EEXIST; } /* * Directory entry creation below may acquire the AGF. Remove * the whiteout from the unlinked list first to preserve correct * AGI/AGF locking order. This dirties the transaction so failures * after this point will abort and log recovery will clean up the * mess. * * For whiteouts, we need to bump the link count on the whiteout * inode. After this point, we have a real link, clear the tmpfile * state flag from the inode so it doesn't accidentally get misused * in future. */ if (du_wip->ip) { struct xfs_perag *pag; ASSERT(VFS_I(du_wip->ip)->i_nlink == 0); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino)); error = xfs_iunlink_remove(tp, pag, du_wip->ip); xfs_perag_put(pag); if (error) return error; xfs_bumplink(tp, du_wip->ip); } /* * Set up the target. */ if (target_ip == NULL) { /* * If target does not exist and the rename crosses * directories, adjust the target directory link count * to account for the ".." reference from the new entry. */ error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) return error; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (new_parent && src_is_directory) { xfs_bumplink(tp, target_dp); } } else { /* target_ip != NULL */ /* * Link the source inode under the target name. * If the source inode is a directory and we are moving * it across directories, its ".." entry will be * inconsistent until we replace that down below. * * In case there is already an entry with the same * name at the destination directory, remove it first. */ error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) return error; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* * Decrement the link count on the target since the target * dir no longer points to it. */ error = xfs_droplink(tp, target_ip); if (error) return error; if (src_is_directory) { /* * Drop the link from the old "." entry. */ error = xfs_droplink(tp, target_ip); if (error) return error; } } /* target_ip != NULL */ /* * Remove the source. */ if (new_parent && src_is_directory) { /* * Rewrite the ".." entry to point to the new * directory. */ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, target_dp->i_ino, spaceres); ASSERT(error != -EEXIST); if (error) return error; } /* * We always want to hit the ctime on the source inode. * * This isn't strictly required by the standards since the source * inode isn't really being changed, but old unix file systems did * it and some incremental backup programs won't work without it. */ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); /* * Adjust the link count on src_dp. This is necessary when * renaming a directory, either within one parent when * the target existed, or across two parent directories. */ if (src_is_directory && (new_parent || target_ip != NULL)) { /* * Decrement link count on src_directory since the * entry that's moved no longer points to it. */ error = xfs_droplink(tp, src_dp); if (error) return error; } /* * For whiteouts, we only need to update the source dirent with the * inode number of the whiteout inode rather than removing it * altogether. */ if (du_wip->ip) error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino, spaceres); else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); if (error) return error; xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); /* Schedule parent pointer updates. */ if (du_wip->ppargs) { error = xfs_parent_addname(tp, du_wip->ppargs, src_dp, src_name, du_wip->ip); if (error) return error; } if (du_src->ppargs) { error = xfs_parent_replacename(tp, du_src->ppargs, src_dp, src_name, target_dp, target_name, src_ip); if (error) return error; } if (du_tgt->ppargs) { error = xfs_parent_removename(tp, du_tgt->ppargs, target_dp, target_name, target_ip); if (error) return error; } /* * Inform our hook clients that we've finished a rename operation as * follows: removed the source and target files from their directories; * that we've added the source to the target directory; and finally * that we've added the whiteout, if there was one. All inodes are * locked, so it's ok to model a rename this way so long as we say we * deleted entries before we add new ones. */ if (target_ip) xfs_dir_update_hook(target_dp, target_ip, -1, target_name); xfs_dir_update_hook(src_dp, src_ip, -1, src_name); xfs_dir_update_hook(target_dp, src_ip, 1, target_name); if (du_wip->ip) xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name); return 0; }
2 4 4 4 4 3 4 4 4 4 2 3 3 2 4 6 6 6 6 6 6 6 6 6 6 6 6 4 3 3 3 2 4 2 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 // SPDX-License-Identifier: GPL-2.0 #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/wext.h> #include <net/hotdata.h> #include "dev.h" static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos) { unsigned long ifindex = *pos; struct net_device *dev; for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { *pos = dev->ifindex; return dev; } return NULL; } static void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; return dev_seq_from_index(seq, pos); } static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return dev_seq_from_index(seq, pos); } static void dev_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, stats->rx_fifo_errors, stats->rx_length_errors + stats->rx_over_errors + stats->rx_crc_errors + stats->rx_frame_errors, stats->rx_compressed, stats->multicast, stats->tx_bytes, stats->tx_packets, stats->tx_errors, stats->tx_dropped, stats->tx_fifo_errors, stats->collisions, stats->tx_carrier_errors + stats->tx_aborted_errors + stats->tx_window_errors + stats->tx_heartbeat_errors, stats->tx_compressed); } /* * Called from the PROCfs module. This now uses the new arbitrary sized * /proc/net interface to create /proc/net/dev */ static int dev_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Inter-| Receive " " | Transmit\n" " face |bytes packets errs drop fifo frame " "compressed multicast|bytes packets errs " "drop fifo colls carrier compressed\n"); else dev_seq_printf_stats(seq, v); return 0; } static u32 softnet_input_pkt_queue_len(struct softnet_data *sd) { return skb_queue_len_lockless(&sd->input_pkt_queue); } static u32 softnet_process_queue_len(struct softnet_data *sd) { return skb_queue_len_lockless(&sd->process_queue); } static struct softnet_data *softnet_get_online(loff_t *pos) { struct softnet_data *sd = NULL; while (*pos < nr_cpu_ids) if (cpu_online(*pos)) { sd = &per_cpu(softnet_data, *pos); break; } else ++*pos; return sd; } static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) { return softnet_get_online(pos); } static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return softnet_get_online(pos); } static void softnet_seq_stop(struct seq_file *seq, void *v) { } static int softnet_seq_show(struct seq_file *seq, void *v) { struct softnet_data *sd = v; u32 input_qlen = softnet_input_pkt_queue_len(sd); u32 process_qlen = softnet_process_queue_len(sd); unsigned int flow_limit_count = 0; #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit *fl; rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif /* the index is the CPU id owing this sd. Since offline CPUs are not * displayed, it would be othrwise not trivial for the user-space * mapping the data a specific CPU */ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", READ_ONCE(sd->processed), numa_drop_read(&sd->drop_counters), READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ READ_ONCE(sd->received_rps), flow_limit_count, input_qlen + process_qlen, (int)seq->index, input_qlen, process_qlen); return 0; } static const struct seq_operations dev_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, .show = dev_seq_show, }; static const struct seq_operations softnet_seq_ops = { .start = softnet_seq_start, .next = softnet_seq_next, .stop = softnet_seq_stop, .show = softnet_seq_show, }; static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; struct net_device *dev; loff_t i = 0; int t; for_each_netdev_rcu(seq_file_net(seq), dev) { ptype_list = &dev->ptype_all; list_for_each_entry_rcu(pt, ptype_list, list) { if (i == pos) return pt; ++i; } } list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { if (i == pos) return pt; ++i; } list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) { if (i == pos) return pt; ++i; } for (t = 0; t < PTYPE_HASH_SIZE; t++) { list_for_each_entry_rcu(pt, &ptype_base[t], list) { if (i == pos) return pt; ++i; } } return NULL; } static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); struct net_device *dev; struct packet_type *pt; struct list_head *nxt; int hash; ++*pos; if (v == SEQ_START_TOKEN) return ptype_get_idx(seq, 0); pt = v; nxt = pt->list.next; if (pt->dev) { if (nxt != &pt->dev->ptype_all) goto found; dev = pt->dev; for_each_netdev_continue_rcu(seq_file_net(seq), dev) { if (!list_empty(&dev->ptype_all)) { nxt = dev->ptype_all.next; goto found; } } nxt = net->ptype_all.next; goto net_ptype_all; } if (pt->af_packet_net) { net_ptype_all: if (nxt != &net->ptype_all && nxt != &net->ptype_specific) goto found; if (nxt == &net->ptype_all) { /* continue with ->ptype_specific if it's not empty */ nxt = net->ptype_specific.next; if (nxt != &net->ptype_specific) goto found; } hash = 0; nxt = ptype_base[0].next; } else hash = ntohs(pt->type) & PTYPE_HASH_MASK; while (nxt == &ptype_base[hash]) { if (++hash >= PTYPE_HASH_SIZE) return NULL; nxt = ptype_base[hash].next; } found: return list_entry(nxt, struct packet_type, list); } static void ptype_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ptype_seq_show(struct seq_file *seq, void *v) { struct packet_type *pt = v; if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else seq_printf(seq, "%04x", ntohs(pt->type)); seq_printf(seq, " %-8s %ps\n", pt->dev ? pt->dev->name : "", pt->func); } return 0; } static const struct seq_operations ptype_seq_ops = { .start = ptype_seq_start, .next = ptype_seq_next, .stop = ptype_seq_stop, .show = ptype_seq_show, }; static int __net_init dev_proc_net_init(struct net *net) { int rc = -ENOMEM; if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops, sizeof(struct seq_net_private))) goto out; if (!proc_create_seq("softnet_stat", 0444, net->proc_net, &softnet_seq_ops)) goto out_dev; if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops, sizeof(struct seq_net_private))) goto out_softnet; if (wext_proc_init(net)) goto out_ptype; rc = 0; out: return rc; out_ptype: remove_proc_entry("ptype", net->proc_net); out_softnet: remove_proc_entry("softnet_stat", net->proc_net); out_dev: remove_proc_entry("dev", net->proc_net); goto out; } static void __net_exit dev_proc_net_exit(struct net *net) { wext_proc_exit(net); remove_proc_entry("ptype", net->proc_net); remove_proc_entry("softnet_stat", net->proc_net); remove_proc_entry("dev", net->proc_net); } static struct pernet_operations __net_initdata dev_proc_ops = { .init = dev_proc_net_init, .exit = dev_proc_net_exit, }; static int dev_mc_seq_show(struct seq_file *seq, void *v) { struct netdev_hw_addr *ha; struct net_device *dev = v; if (v == SEQ_START_TOKEN) return 0; netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", dev->ifindex, dev->name, ha->refcount, ha->global_use, (int)dev->addr_len, ha->addr); } netif_addr_unlock_bh(dev); return 0; } static const struct seq_operations dev_mc_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, .show = dev_mc_seq_show, }; static int __net_init dev_mc_net_init(struct net *net) { if (!proc_create_net("dev_mcast", 0, net->proc_net, &dev_mc_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static void __net_exit dev_mc_net_exit(struct net *net) { remove_proc_entry("dev_mcast", net->proc_net); } static struct pernet_operations __net_initdata dev_mc_net_ops = { .init = dev_mc_net_init, .exit = dev_mc_net_exit, }; int __init dev_proc_init(void) { int ret = register_pernet_subsys(&dev_proc_ops); if (!ret) return register_pernet_subsys(&dev_mc_net_ops); return ret; }
23 27 27 27 26 8 21 5 19 16 16 2 16 16 9 7 7 4 3 7 7 1 35 35 1 34 27 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" #include "xfs_health.h" /* * Directory file type support functions */ static unsigned char xfs_dir3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, }; unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, uint8_t filetype) { if (!xfs_has_ftype(mp)) return DT_UNKNOWN; if (filetype >= XFS_DIR3_FT_MAX) return DT_UNKNOWN; return xfs_dir3_filetype_table[filetype]; } STATIC int xfs_dir2_sf_getdents( struct xfs_da_args *args, struct dir_context *ctx) { int i; /* shortform entry number */ struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(sfp != NULL); /* * If the block number in the offset is out of range, we're done. */ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; /* * Precalculate offsets for "." and ".." as we will always need them. * This relies on the fact that directories always start with the * entries for "." and "..". */ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, geo->data_entry_offset); dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, geo->data_entry_offset + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); /* * Put . entry unless we're starting past it. */ if (ctx->pos <= dot_offset) { ctx->pos = dot_offset & 0x7fffffff; if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) return 0; } /* * Put .. entry unless we're starting past it. */ if (ctx->pos <= dotdot_offset) { ino = xfs_dir2_sf_get_parent_ino(sfp); ctx->pos = dotdot_offset & 0x7fffffff; if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) return 0; } /* * Loop while there are more entries and put'ing works. */ sfep = xfs_dir2_sf_firstentry(sfp); for (i = 0; i < sfp->count; i++) { uint8_t filetype; off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, xfs_dir2_sf_get_offset(sfep)); if (ctx->pos > off) { sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); continue; } ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); filetype = xfs_dir2_sf_get_ftype(mp, sfep); ctx->pos = off & 0x7fffffff; if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(sfep->name, sfep->namelen))) { xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, xfs_dir3_get_dtype(mp, filetype))) return 0; sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; return 0; } /* * Readdir for block directories. */ STATIC int xfs_dir2_block_getdents( struct xfs_da_args *args, struct dir_context *ctx, unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_buf *bp; /* buffer for block */ int error; /* error return value */ int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; unsigned int offset, next_offset; unsigned int end; /* * If the block number in the offset is out of range, we're done. */ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; error = xfs_dir3_block_read(args->trans, dp, args->owner, &bp); if (error) return error; xfs_iunlock(dp, *lock_mode); *lock_mode = 0; /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); xfs_dir3_data_check(dp, bp); /* * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ end = xfs_dir3_data_end_offset(geo, bp->b_addr); for (offset = geo->data_entry_offset; offset < end; offset = next_offset) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; uint8_t filetype; /* * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { next_offset = offset + be16_to_cpu(dup->length); continue; } /* * Bump pointer for the next iteration. */ next_offset = offset + xfs_dir2_data_entsize(dp->i_mount, dep->namelen); /* * The entry is before the desired starting point, skip it. */ if (offset < wantoff) continue; cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, offset); ctx->pos = cook & 0x7fffffff; filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); /* * If it didn't fit, set the final offset to here & return. */ if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_rele; } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) goto out_rele; } /* * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; out_rele: xfs_trans_brelse(args->trans, bp); return error; } /* * Read a directory block and initiate readahead for blocks beyond that. * We maintain a sliding readahead window of the remaining space in the * buffer rounded up to the nearest block. */ STATIC int xfs_dir2_leaf_readbuf( struct xfs_da_args *args, size_t bufsize, xfs_dir2_off_t *cur_off, xfs_dablk_t *ra_blk, struct xfs_buf **bpp) { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; struct xfs_da_geometry *geo = args->geo; struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_bmbt_irec map; struct blk_plug plug; xfs_dir2_off_t new_off; xfs_dablk_t next_ra; xfs_dablk_t map_off; xfs_dablk_t last_da; struct xfs_iext_cursor icur; int ra_want; int error = 0; error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK); if (error) goto out; /* * Look for mapped directory blocks at or above the current offset. * Truncate down to the nearest directory block to start the scanning * operation. */ last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off)); if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) goto out; if (map.br_startoff >= last_da) goto out; xfs_trim_extent(&map, map_off, last_da - map_off); /* Read the directory block of that first mapping. */ new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); if (new_off > *cur_off) *cur_off = new_off; error = xfs_dir3_data_read(args->trans, dp, args->owner, map.br_startoff, 0, &bp); if (error) goto out; /* * Start readahead for the next bufsize's worth of dir data blocks. * We may have already issued readahead for some of that range; * ra_blk tracks the last block we tried to read(ahead). */ ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); if (*ra_blk >= last_da) goto out; else if (*ra_blk == 0) *ra_blk = map.br_startoff; next_ra = map.br_startoff + geo->fsbcount; if (next_ra >= last_da) goto out_no_ra; if (map.br_blockcount < geo->fsbcount && !xfs_iext_next_extent(ifp, &icur, &map)) goto out_no_ra; if (map.br_startoff >= last_da) goto out_no_ra; xfs_trim_extent(&map, next_ra, last_da - next_ra); /* Start ra for each dir (not fs) block that has a mapping. */ blk_start_plug(&plug); while (ra_want > 0) { next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount); while (ra_want > 0 && next_ra < map.br_startoff + map.br_blockcount) { if (next_ra >= last_da) { *ra_blk = last_da; break; } if (next_ra > *ra_blk) { xfs_dir3_data_readahead(dp, next_ra, XFS_DABUF_MAP_HOLE_OK); *ra_blk = next_ra; } ra_want -= geo->fsbcount; next_ra += geo->fsbcount; } if (!xfs_iext_next_extent(ifp, &icur, &map)) { *ra_blk = last_da; break; } } blk_finish_plug(&plug); out: *bpp = bp; return error; out_no_ra: *ra_blk = last_da; goto out; } /* * Getdents (readdir) for leaf and node directories. * This reads the data blocks only, so is the same for both forms. */ STATIC int xfs_dir2_leaf_getdents( struct xfs_da_args *args, struct dir_context *ctx, size_t bufsize, unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ struct xfs_da_geometry *geo = args->geo; xfs_dablk_t rablk = 0; /* current readahead block */ xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ unsigned int offset = 0; int error = 0; /* error return value */ /* * If the offset is at or past the largest allowed value, * give up right away. */ if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) return 0; /* * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ curoff = xfs_dir2_dataptr_to_byte(ctx->pos); /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. */ while (curoff < XFS_DIR2_LEAF_OFFSET) { uint8_t filetype; /* * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ if (!bp || offset >= geo->blksize) { if (bp) { xfs_trans_brelse(args->trans, bp); bp = NULL; } if (*lock_mode == 0) *lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, &rablk, &bp); if (error || !bp) break; xfs_iunlock(dp, *lock_mode); *lock_mode = 0; xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ offset = geo->data_entry_offset; byteoff = xfs_dir2_byte_to_off(geo, curoff); /* * Skip past the header. */ if (byteoff == 0) curoff += geo->data_entry_offset; /* * Skip past entries until we reach our offset. */ else { while (offset < byteoff) { dup = bp->b_addr + offset; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); offset += length; continue; } dep = bp->b_addr + offset; length = xfs_dir2_data_entsize(mp, dep->namelen); offset += length; } /* * Now set our real offset. */ curoff = xfs_dir2_db_off_to_byte(geo, xfs_dir2_byte_to_db(geo, curoff), offset); if (offset >= geo->blksize) continue; } } /* * We have a pointer to an entry. Is it a live one? */ dup = bp->b_addr + offset; /* * No, it's unused, skip over it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); offset += length; curoff += length; continue; } dep = bp->b_addr + offset; length = xfs_dir2_data_entsize(mp, dep->namelen); filetype = xfs_dir2_data_get_ftype(mp, dep); ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; break; } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) break; /* * Advance to next entry in the block. */ offset += length; curoff += length; /* bufsize may have just been a guess; don't go negative */ bufsize = bufsize > length ? bufsize - length : 0; } /* * All done. Set output offset value to current offset. */ if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; if (bp) xfs_trans_brelse(args->trans, bp); return error; } /* * Read a directory. * * If supplied, the transaction collects locked dir buffers to avoid * nested buffer deadlocks. This function does not dirty the * transaction. The caller must hold the IOLOCK (shared or exclusive) * before calling this function. */ int xfs_readdir( struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize) { struct xfs_da_args args = { NULL }; unsigned int lock_mode; int error; trace_xfs_readdir(dp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; args.geo = dp->i_mount->m_dir_geo; args.trans = tp; args.owner = dp->i_ino; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return xfs_dir2_sf_getdents(&args, ctx); lock_mode = xfs_ilock_data_map_shared(dp); switch (xfs_dir2_format(&args, &error)) { case XFS_DIR2_FMT_BLOCK: error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); break; case XFS_DIR2_FMT_LEAF: case XFS_DIR2_FMT_NODE: error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); break; default: break; } if (lock_mode) xfs_iunlock(dp, lock_mode); return error; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ /* This file provides custom allocation primitives */ #define ZSTD_DEPS_NEED_MALLOC #include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ #include "compiler.h" /* MEM_STATIC */ #define ZSTD_STATIC_LINKING_ONLY #include <linux/zstd.h> /* ZSTD_customMem */ #ifndef ZSTD_ALLOCATIONS_H #define ZSTD_ALLOCATIONS_H /* custom memory allocation functions */ MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) { if (customMem.customAlloc) return customMem.customAlloc(customMem.opaque, size); return ZSTD_malloc(size); } MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) { if (customMem.customAlloc) { /* calloc implemented as malloc+memset; * not as efficient as calloc, but next best guess for custom malloc */ void* const ptr = customMem.customAlloc(customMem.opaque, size); ZSTD_memset(ptr, 0, size); return ptr; } return ZSTD_calloc(1, size); } MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) { if (ptr!=NULL) { if (customMem.customFree) customMem.customFree(customMem.opaque, ptr); else ZSTD_free(ptr); } } #endif /* ZSTD_ALLOCATIONS_H */
4 3 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 /* zutil.h -- internal interface and configuration of the compression library * Copyright (C) 1995-1998 Jean-loup Gailly. * For conditions of distribution and use, see copyright notice in zlib.h */ /* WARNING: this file should *not* be used by applications. It is part of the implementation of the compression library and is subject to change. Applications should only use zlib.h. */ /* @(#) $Id: zutil.h,v 1.1 2000/01/01 03:32:23 davem Exp $ */ #ifndef _Z_UTIL_H #define _Z_UTIL_H #include <linux/zlib.h> #include <linux/string.h> #include <linux/kernel.h> typedef unsigned char uch; typedef unsigned short ush; typedef unsigned long ulg; /* common constants */ #define STORED_BLOCK 0 #define STATIC_TREES 1 #define DYN_TREES 2 /* The three kinds of block type */ #define MIN_MATCH 3 #define MAX_MATCH 258 /* The minimum and maximum match lengths */ #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ /* target dependencies */ /* Common defaults */ #ifndef OS_CODE # define OS_CODE 0x03 /* assume Unix */ #endif /* functions */ typedef uLong (*check_func) (uLong check, const Byte *buf, uInt len); /* checksum functions */ #define BASE 65521L /* largest prime smaller than 65536 */ #define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ #define DO1(buf,i) {s1 += buf[i]; s2 += s1;} #define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); #define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); #define DO16(buf) DO8(buf,0); DO8(buf,8); /* ========================================================================= */ /* Update a running Adler-32 checksum with the bytes buf[0..len-1] and return the updated checksum. If buf is NULL, this function returns the required initial value for the checksum. An Adler-32 checksum is almost as reliable as a CRC32 but can be computed much faster. Usage example: uLong adler = zlib_adler32(0L, NULL, 0); while (read_buffer(buffer, length) != EOF) { adler = zlib_adler32(adler, buffer, length); } if (adler != original_adler) error(); */ static inline uLong zlib_adler32(uLong adler, const Byte *buf, uInt len) { unsigned long s1 = adler & 0xffff; unsigned long s2 = (adler >> 16) & 0xffff; int k; if (buf == NULL) return 1L; while (len > 0) { k = len < NMAX ? len : NMAX; len -= k; while (k >= 16) { DO16(buf); buf += 16; k -= 16; } if (k != 0) do { s1 += *buf++; s2 += s1; } while (--k); s1 %= BASE; s2 %= BASE; } return (s2 << 16) | s1; } #endif /* _Z_UTIL_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 /* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ /* * nilfs2_ondisk.h - NILFS2 on-disk structures * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. */ /* * linux/include/linux/ext2_fs.h * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/include/linux/minix_fs.h * * Copyright (C) 1991, 1992 Linus Torvalds */ #ifndef _LINUX_NILFS2_ONDISK_H #define _LINUX_NILFS2_ONDISK_H #include <linux/types.h> #include <linux/magic.h> #include <asm/byteorder.h> #define NILFS_INODE_BMAP_SIZE 7 /** * struct nilfs_inode - structure of an inode on disk * @i_blocks: blocks count * @i_size: size in bytes * @i_ctime: creation time (seconds) * @i_mtime: modification time (seconds) * @i_ctime_nsec: creation time (nano seconds) * @i_mtime_nsec: modification time (nano seconds) * @i_uid: user id * @i_gid: group id * @i_mode: file mode * @i_links_count: links count * @i_flags: file flags * @i_bmap: block mapping * @i_xattr: extended attributes * @i_generation: file generation (for NFS) * @i_pad: padding */ struct nilfs_inode { __le64 i_blocks; __le64 i_size; __le64 i_ctime; __le64 i_mtime; __le32 i_ctime_nsec; __le32 i_mtime_nsec; __le32 i_uid; __le32 i_gid; __le16 i_mode; __le16 i_links_count; __le32 i_flags; __le64 i_bmap[NILFS_INODE_BMAP_SIZE]; #define i_device_code i_bmap[0] __le64 i_xattr; __le32 i_generation; __le32 i_pad; }; #define NILFS_MIN_INODE_SIZE 128 /** * struct nilfs_super_root - structure of super root * @sr_sum: check sum * @sr_bytes: byte count of the structure * @sr_flags: flags (reserved) * @sr_nongc_ctime: write time of the last segment not for cleaner operation * @sr_dat: DAT file inode * @sr_cpfile: checkpoint file inode * @sr_sufile: segment usage file inode */ struct nilfs_super_root { __le32 sr_sum; __le16 sr_bytes; __le16 sr_flags; __le64 sr_nongc_ctime; struct nilfs_inode sr_dat; struct nilfs_inode sr_cpfile; struct nilfs_inode sr_sufile; }; #define NILFS_SR_MDT_OFFSET(inode_size, i) \ ((unsigned long)&((struct nilfs_super_root *)0)->sr_dat + \ (inode_size) * (i)) #define NILFS_SR_DAT_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 0) #define NILFS_SR_CPFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 1) #define NILFS_SR_SUFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 2) #define NILFS_SR_BYTES(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 3) /* * Maximal mount counts */ #define NILFS_DFL_MAX_MNT_COUNT 50 /* 50 mounts */ /* * File system states (sbp->s_state, nilfs->ns_mount_state) */ #define NILFS_VALID_FS 0x0001 /* Unmounted cleanly */ #define NILFS_ERROR_FS 0x0002 /* Errors detected */ #define NILFS_RESIZE_FS 0x0004 /* Resize required */ /* * Mount flags (sbi->s_mount_opt) */ #define NILFS_MOUNT_ERROR_MODE 0x0070 /* Error mode mask */ #define NILFS_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ #define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ #define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ #define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */ #define NILFS_MOUNT_STRICT_ORDER 0x2000 /* * Apply strict in-order * semantics also for data */ #define NILFS_MOUNT_NORECOVERY 0x4000 /* * Disable write access during * mount-time recovery */ #define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ /** * struct nilfs_super_block - structure of super block on disk */ struct nilfs_super_block { /*00*/ __le32 s_rev_level; /* Revision level */ __le16 s_minor_rev_level; /* minor revision level */ __le16 s_magic; /* Magic signature */ __le16 s_bytes; /* * Bytes count of CRC calculation * for this structure. s_reserved * is excluded. */ __le16 s_flags; /* flags */ __le32 s_crc_seed; /* Seed value of CRC calculation */ /*10*/ __le32 s_sum; /* Check sum of super block */ __le32 s_log_block_size; /* * Block size represented as follows * blocksize = * 1 << (s_log_block_size + 10) */ __le64 s_nsegments; /* Number of segments in filesystem */ /*20*/ __le64 s_dev_size; /* block device size in bytes */ __le64 s_first_data_block; /* 1st seg disk block number */ /*30*/ __le32 s_blocks_per_segment; /* number of blocks per full segment */ __le32 s_r_segments_percentage; /* Reserved segments percentage */ __le64 s_last_cno; /* Last checkpoint number */ /*40*/ __le64 s_last_pseg; /* disk block addr pseg written last */ __le64 s_last_seq; /* seq. number of seg written last */ /*50*/ __le64 s_free_blocks_count; /* Free blocks count */ __le64 s_ctime; /* * Creation time (execution time of * newfs) */ /*60*/ __le64 s_mtime; /* Mount time */ __le64 s_wtime; /* Write time */ /*70*/ __le16 s_mnt_count; /* Mount count */ __le16 s_max_mnt_count; /* Maximal mount count */ __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le64 s_lastcheck; /* time of last check */ /*80*/ __le32 s_checkinterval; /* max. time between checks */ __le32 s_creator_os; /* OS */ __le16 s_def_resuid; /* Default uid for reserved blocks */ __le16 s_def_resgid; /* Default gid for reserved blocks */ __le32 s_first_ino; /* First non-reserved inode */ /*90*/ __le16 s_inode_size; /* Size of an inode */ __le16 s_dat_entry_size; /* Size of a dat entry */ __le16 s_checkpoint_size; /* Size of a checkpoint */ __le16 s_segment_usage_size; /* Size of a segment usage */ /*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*A8*/ char s_volume_name[80] /* volume name */ __kernel_nonstring; /*F8*/ __le32 s_c_interval; /* Commit interval of segment */ __le32 s_c_block_max; /* * Threshold of data amount for * the segment construction */ /*100*/ __le64 s_feature_compat; /* Compatible feature set */ __le64 s_feature_compat_ro; /* Read-only compatible feature set */ __le64 s_feature_incompat; /* Incompatible feature set */ __u32 s_reserved[186]; /* padding to the end of the block */ }; /* * Codes for operating systems */ #define NILFS_OS_LINUX 0 /* Codes from 1 to 4 are reserved to keep compatibility with ext2 creator-OS */ /* * Revision levels */ #define NILFS_CURRENT_REV 2 /* current major revision */ #define NILFS_MINOR_REV 0 /* minor revision */ #define NILFS_MIN_SUPP_REV 2 /* minimum supported revision */ /* * Feature set definitions * * If there is a bit set in the incompatible feature set that the kernel * doesn't know about, it should refuse to mount the filesystem. */ #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT 0x00000001ULL #define NILFS_FEATURE_COMPAT_SUPP 0ULL #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT #define NILFS_FEATURE_INCOMPAT_SUPP 0ULL /* * Bytes count of super_block for CRC-calculation */ #define NILFS_SB_BYTES \ ((long)&((struct nilfs_super_block *)0)->s_reserved) /* * Special inode number */ #define NILFS_ROOT_INO 2 /* Root file inode */ #define NILFS_DAT_INO 3 /* DAT file */ #define NILFS_CPFILE_INO 4 /* checkpoint file */ #define NILFS_SUFILE_INO 5 /* segment usage file */ #define NILFS_IFILE_INO 6 /* ifile */ #define NILFS_ATIME_INO 7 /* Atime file (reserved) */ #define NILFS_XATTR_INO 8 /* Xattribute file (reserved) */ #define NILFS_SKETCH_INO 10 /* Sketch file */ #define NILFS_USER_INO 11 /* Fisrt user's file inode number */ #define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */ #define NILFS_SEG_MIN_BLOCKS 16 /* * Minimum number of blocks in * a full segment */ #define NILFS_PSEG_MIN_BLOCKS 2 /* * Minimum number of blocks in * a partial segment */ #define NILFS_MIN_NRSVSEGS 8 /* * Minimum number of reserved * segments */ /* * We call DAT, cpfile, and sufile root metadata files. Inodes of * these files are written in super root block instead of ifile, and * garbage collector doesn't keep any past versions of these files. */ #define NILFS_ROOT_METADATA_FILE(ino) \ ((ino) >= NILFS_DAT_INO && (ino) <= NILFS_SUFILE_INO) /* * bytes offset of secondary super block */ #define NILFS_SB2_OFFSET_BYTES(devsize) ((((devsize) >> 12) - 1) << 12) /* * Maximal count of links to a file */ #define NILFS_LINK_MAX 32000 /* * Structure of a directory entry * (Same as ext2) */ #define NILFS_NAME_LEN 255 /* * Block size limitations */ #define NILFS_MIN_BLOCK_SIZE 1024 #define NILFS_MAX_BLOCK_SIZE 65536 /* * The new version of the directory entry. Since V0 structures are * stored in intel byte order, and the name_len field could never be * bigger than 255 chars, it's safe to reclaim the extra byte for the * file_type field. */ struct nilfs_dir_entry { __le64 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /* Dir entry type (file, dir, etc) */ char name[NILFS_NAME_LEN]; /* File name */ char pad; }; /* * NILFS directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ enum { NILFS_FT_UNKNOWN, NILFS_FT_REG_FILE, NILFS_FT_DIR, NILFS_FT_CHRDEV, NILFS_FT_BLKDEV, NILFS_FT_FIFO, NILFS_FT_SOCK, NILFS_FT_SYMLINK, NILFS_FT_MAX }; /* * NILFS_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 8 */ #define NILFS_DIR_PAD 8 #define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) #define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ ~NILFS_DIR_ROUND) #define NILFS_MAX_REC_LEN ((1 << 16) - 1) /** * struct nilfs_finfo - file information * @fi_ino: inode number * @fi_cno: checkpoint number * @fi_nblocks: number of blocks (including intermediate blocks) * @fi_ndatablk: number of file data blocks */ struct nilfs_finfo { __le64 fi_ino; __le64 fi_cno; __le32 fi_nblocks; __le32 fi_ndatablk; }; /** * struct nilfs_binfo_v - information on a data block (except DAT) * @bi_vblocknr: virtual block number * @bi_blkoff: block offset */ struct nilfs_binfo_v { __le64 bi_vblocknr; __le64 bi_blkoff; }; /** * struct nilfs_binfo_dat - information on a DAT node block * @bi_blkoff: block offset * @bi_level: level * @bi_pad: padding */ struct nilfs_binfo_dat { __le64 bi_blkoff; __u8 bi_level; __u8 bi_pad[7]; }; /** * union nilfs_binfo: block information * @bi_v: nilfs_binfo_v structure * @bi_dat: nilfs_binfo_dat structure */ union nilfs_binfo { struct nilfs_binfo_v bi_v; struct nilfs_binfo_dat bi_dat; }; /** * struct nilfs_segment_summary - segment summary header * @ss_datasum: checksum of data * @ss_sumsum: checksum of segment summary * @ss_magic: magic number * @ss_bytes: size of this structure in bytes * @ss_flags: flags * @ss_seq: sequence number * @ss_create: creation timestamp * @ss_next: next segment * @ss_nblocks: number of blocks * @ss_nfinfo: number of finfo structures * @ss_sumbytes: total size of segment summary in bytes * @ss_pad: padding * @ss_cno: checkpoint number */ struct nilfs_segment_summary { __le32 ss_datasum; __le32 ss_sumsum; __le32 ss_magic; __le16 ss_bytes; __le16 ss_flags; __le64 ss_seq; __le64 ss_create; __le64 ss_next; __le32 ss_nblocks; __le32 ss_nfinfo; __le32 ss_sumbytes; __le32 ss_pad; __le64 ss_cno; /* array of finfo structures */ }; #define NILFS_SEGSUM_MAGIC 0x1eaffa11 /* segment summary magic number */ /* * Segment summary flags */ #define NILFS_SS_LOGBGN 0x0001 /* begins a logical segment */ #define NILFS_SS_LOGEND 0x0002 /* ends a logical segment */ #define NILFS_SS_SR 0x0004 /* has super root */ #define NILFS_SS_SYNDT 0x0008 /* includes data only updates */ #define NILFS_SS_GC 0x0010 /* segment written for cleaner operation */ /** * struct nilfs_btree_node - header of B-tree node block * @bn_flags: flags * @bn_level: level * @bn_nchildren: number of children * @bn_pad: padding */ struct nilfs_btree_node { __u8 bn_flags; __u8 bn_level; __le16 bn_nchildren; __le32 bn_pad; }; /* flags */ #define NILFS_BTREE_NODE_ROOT 0x01 /* level */ #define NILFS_BTREE_LEVEL_DATA 0 #define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) #define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ /** * struct nilfs_direct_node - header of built-in bmap array * @dn_flags: flags * @dn_pad: padding */ struct nilfs_direct_node { __u8 dn_flags; __u8 pad[7]; }; /** * struct nilfs_palloc_group_desc - block group descriptor * @pg_nfrees: number of free entries in block group */ struct nilfs_palloc_group_desc { __le32 pg_nfrees; }; /** * struct nilfs_dat_entry - disk address translation entry * @de_blocknr: block number * @de_start: start checkpoint number * @de_end: end checkpoint number * @de_rsv: reserved for future use */ struct nilfs_dat_entry { __le64 de_blocknr; __le64 de_start; __le64 de_end; __le64 de_rsv; }; #define NILFS_MIN_DAT_ENTRY_SIZE 32 /** * struct nilfs_snapshot_list - snapshot list * @ssl_next: next checkpoint number on snapshot list * @ssl_prev: previous checkpoint number on snapshot list */ struct nilfs_snapshot_list { __le64 ssl_next; __le64 ssl_prev; }; /** * struct nilfs_checkpoint - checkpoint structure * @cp_flags: flags * @cp_checkpoints_count: checkpoints count in a block * @cp_snapshot_list: snapshot list * @cp_cno: checkpoint number * @cp_create: creation timestamp * @cp_nblk_inc: number of blocks incremented by this checkpoint * @cp_inodes_count: inodes count * @cp_blocks_count: blocks count * @cp_ifile_inode: inode of ifile */ struct nilfs_checkpoint { __le32 cp_flags; __le32 cp_checkpoints_count; struct nilfs_snapshot_list cp_snapshot_list; __le64 cp_cno; __le64 cp_create; __le64 cp_nblk_inc; __le64 cp_inodes_count; __le64 cp_blocks_count; /* * Do not change the byte offset of ifile inode. * To keep the compatibility of the disk format, * additional fields should be added behind cp_ifile_inode. */ struct nilfs_inode cp_ifile_inode; }; #define NILFS_MIN_CHECKPOINT_SIZE (64 + NILFS_MIN_INODE_SIZE) /* checkpoint flags */ enum { NILFS_CHECKPOINT_SNAPSHOT, NILFS_CHECKPOINT_INVALID, NILFS_CHECKPOINT_SKETCH, NILFS_CHECKPOINT_MINOR, }; #define NILFS_CHECKPOINT_FNS(flag, name) \ static inline void \ nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ { \ cp->cp_flags = __cpu_to_le32(__le32_to_cpu(cp->cp_flags) | \ (1UL << NILFS_CHECKPOINT_##flag)); \ } \ static inline void \ nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ { \ cp->cp_flags = __cpu_to_le32(__le32_to_cpu(cp->cp_flags) & \ ~(1UL << NILFS_CHECKPOINT_##flag)); \ } \ static inline int \ nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ { \ return !!(__le32_to_cpu(cp->cp_flags) & \ (1UL << NILFS_CHECKPOINT_##flag)); \ } NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) NILFS_CHECKPOINT_FNS(INVALID, invalid) NILFS_CHECKPOINT_FNS(MINOR, minor) /** * struct nilfs_cpfile_header - checkpoint file header * @ch_ncheckpoints: number of checkpoints * @ch_nsnapshots: number of snapshots * @ch_snapshot_list: snapshot list */ struct nilfs_cpfile_header { __le64 ch_ncheckpoints; __le64 ch_nsnapshots; struct nilfs_snapshot_list ch_snapshot_list; }; #define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ ((sizeof(struct nilfs_cpfile_header) + \ sizeof(struct nilfs_checkpoint) - 1) / \ sizeof(struct nilfs_checkpoint)) /** * struct nilfs_segment_usage - segment usage * @su_lastmod: last modified timestamp * @su_nblocks: number of blocks in segment * @su_flags: flags */ struct nilfs_segment_usage { __le64 su_lastmod; __le32 su_nblocks; __le32 su_flags; }; #define NILFS_MIN_SEGMENT_USAGE_SIZE 16 /* segment usage flag */ enum { NILFS_SEGMENT_USAGE_ACTIVE, NILFS_SEGMENT_USAGE_DIRTY, NILFS_SEGMENT_USAGE_ERROR, }; #define NILFS_SEGMENT_USAGE_FNS(flag, name) \ static inline void \ nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ { \ su->su_flags = __cpu_to_le32(__le32_to_cpu(su->su_flags) | \ (1UL << NILFS_SEGMENT_USAGE_##flag));\ } \ static inline void \ nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ { \ su->su_flags = \ __cpu_to_le32(__le32_to_cpu(su->su_flags) & \ ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ } \ static inline int \ nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ { \ return !!(__le32_to_cpu(su->su_flags) & \ (1UL << NILFS_SEGMENT_USAGE_##flag)); \ } NILFS_SEGMENT_USAGE_FNS(ACTIVE, active) NILFS_SEGMENT_USAGE_FNS(DIRTY, dirty) NILFS_SEGMENT_USAGE_FNS(ERROR, error) static inline void nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) { su->su_lastmod = __cpu_to_le64(0); su->su_nblocks = __cpu_to_le32(0); su->su_flags = __cpu_to_le32(0); } static inline int nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) { return !__le32_to_cpu(su->su_flags); } /** * struct nilfs_sufile_header - segment usage file header * @sh_ncleansegs: number of clean segments * @sh_ndirtysegs: number of dirty segments * @sh_last_alloc: last allocated segment number */ struct nilfs_sufile_header { __le64 sh_ncleansegs; __le64 sh_ndirtysegs; __le64 sh_last_alloc; /* ... */ }; #define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ ((sizeof(struct nilfs_sufile_header) + \ sizeof(struct nilfs_segment_usage) - 1) / \ sizeof(struct nilfs_segment_usage)) #endif /* _LINUX_NILFS2_ONDISK_H */
1381 1356 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_COMMON_H #define _NF_CONNTRACK_COMMON_H #include <linux/refcount.h> #include <uapi/linux/netfilter/nf_conntrack_common.h> struct ip_conntrack_stat { unsigned int found; unsigned int invalid; unsigned int insert; unsigned int insert_failed; unsigned int clash_resolve; unsigned int drop; unsigned int early_drop; unsigned int error; unsigned int expect_new; unsigned int expect_create; unsigned int expect_delete; unsigned int search_restart; unsigned int chaintoolong; }; #define NFCT_INFOMASK 7UL #define NFCT_PTRMASK ~(NFCT_INFOMASK) struct nf_conntrack { refcount_t use; }; void nf_conntrack_destroy(struct nf_conntrack *nfct); /* like nf_ct_put, but without module dependency on nf_conntrack */ static inline void nf_conntrack_put(struct nf_conntrack *nfct) { if (nfct && refcount_dec_and_test(&nfct->use)) nf_conntrack_destroy(nfct); } static inline void nf_conntrack_get(struct nf_conntrack *nfct) { if (nfct) refcount_inc(&nfct->use); } #endif /* _NF_CONNTRACK_COMMON_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 /* SPDX-License-Identifier: GPL-2.0 */ /* interrupt.h */ #ifndef _LINUX_INTERRUPT_H #define _LINUX_INTERRUPT_H #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cleanup.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/hrtimer.h> #include <linux/kref.h> #include <linux/cpumask_types.h> #include <linux/workqueue.h> #include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/sections.h> /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When * requesting an interrupt without specifying a IRQF_TRIGGER, the * setting should be assumed to be "as already configured", which * may be as per machine or firmware initialisation. */ #define IRQF_TRIGGER_NONE 0x00000000 #define IRQF_TRIGGER_RISING 0x00000001 #define IRQF_TRIGGER_FALLING 0x00000002 #define IRQF_TRIGGER_HIGH 0x00000004 #define IRQF_TRIGGER_LOW 0x00000008 #define IRQF_TRIGGER_MASK (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \ IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING) #define IRQF_TRIGGER_PROBE 0x00000010 /* * These flags used only by the kernel as part of the * irq handling routines. * * IRQF_SHARED - allow sharing the irq among several devices * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur * IRQF_TIMER - Flag to mark this interrupt as timer interrupt * IRQF_PERCPU - Interrupt is per cpu * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is * registered first in a shared interrupt is considered for * performance reasons) * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee * that this interrupt will wake the system from a suspended * state. See Documentation/power/suspend-and-interrupts.rst * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this * interrupt handler after suspending interrupts. For system * wakeup devices users need to implement wakeup detection in * their interrupt handlers. * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it. * Users will enable it explicitly by enable_irq() or enable_nmi() * later. * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers, * depends on IRQF_PERCPU. * IRQF_COND_ONESHOT - Agree to do IRQF_ONESHOT if already set for a shared * interrupt. */ #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 #define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 #define IRQF_COND_SUSPEND 0x00040000 #define IRQF_NO_AUTOEN 0x00080000 #define IRQF_NO_DEBUG 0x00100000 #define IRQF_COND_ONESHOT 0x00200000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) /* * These values can be returned by request_any_context_irq() and * describe the context the interrupt will be run in. * * IRQC_IS_HARDIRQ - interrupt runs in hardirq context * IRQC_IS_NESTED - interrupt runs in a nested threaded context */ enum { IRQC_IS_HARDIRQ = 0, IRQC_IS_NESTED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); /** * struct irqaction - per interrupt action descriptor * @handler: interrupt handler function * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) * @thread_fn: interrupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @secondary: pointer to secondary irqaction (force threading) * @thread_flags: flags related to @thread * @thread_mask: bitmask for keeping track of @thread activity * @dir: pointer to the proc/irq/NN/name entry */ struct irqaction { irq_handler_t handler; void *dev_id; void __percpu *percpu_dev_id; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; struct irqaction *secondary; unsigned int irq; unsigned int flags; unsigned long thread_flags; unsigned long thread_mask; const char *name; struct proc_dir_entry *dir; } ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); /* * If a (PCI) device interrupt is not connected we set dev->irq to * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we * can distinguish that case from other error returns. * * 0x80000000 is guaranteed to be outside the available range of interrupts * and easy to distinguish from other possible incorrect values. */ #define IRQ_NOTCONNECTED (1U << 31) extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long flags, const char *name, void *dev); /** * request_irq - Add a handler for an interrupt line * @irq: The interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts * If NULL, the default primary handler is installed * @flags: Handling flags * @name: Name of the device generating this interrupt * @dev: A cookie passed to the handler function * * This call allocates an interrupt and establishes a handler; see * the documentation for request_threaded_irq() for details. */ static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) { return request_threaded_irq(irq, handler, NULL, flags | IRQF_COND_ONESHOT, name, dev); } extern int __must_check request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id); extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev); static inline int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, devname, percpu_dev_id); } extern int __must_check request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *dev); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); extern const void *free_nmi(unsigned int irq, void *dev_id); extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id); struct device; extern int __must_check devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id); static inline int __must_check devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, devname, dev_id); } extern int __must_check devm_request_any_context_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); extern bool irq_percpu_is_enabled(unsigned int irq); extern void irq_wake_thread(unsigned int irq, void *dev_id); DEFINE_LOCK_GUARD_1(disable_irq, int, disable_irq(*_T->lock), enable_irq(*_T->lock)) extern void disable_nmi_nosync(unsigned int irq); extern void disable_percpu_nmi(unsigned int irq); extern void enable_nmi(unsigned int irq); extern void enable_percpu_nmi(unsigned int irq, unsigned int type); extern int prepare_percpu_nmi(unsigned int irq); extern void teardown_percpu_nmi(unsigned int irq); extern int irq_inject_interrupt(unsigned int irq); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); extern void rearm_wake_irq(unsigned int irq); /** * struct irq_affinity_notify - context for notification of IRQ affinity changes * @irq: Interrupt to which notification applies * @kref: Reference count, for internal use * @work: Work item, for internal use * @notify: Function to be called on change. This will be * called in process context. * @release: Function to be called on release. This will be * called in process context. Once registered, the * structure must only be freed when this function is * called or later. */ struct irq_affinity_notify { unsigned int irq; struct kref kref; struct work_struct work; void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); void (*release)(struct kref *ref); }; #define IRQ_AFFINITY_MAX_SETS 4 /** * struct irq_affinity - Description for automatic irq affinity assignments * @pre_vectors: Don't apply affinity to @pre_vectors at beginning of * the MSI(-X) vector space * @post_vectors: Don't apply affinity to @post_vectors at end of * the MSI(-X) vector space * @nr_sets: The number of interrupt sets for which affinity * spreading is required * @set_size: Array holding the size of each interrupt set * @calc_sets: Callback for calculating the number and size * of interrupt sets * @priv: Private data for usage by @calc_sets, usually a * pointer to driver/device specific data. */ struct irq_affinity { unsigned int pre_vectors; unsigned int post_vectors; unsigned int nr_sets; unsigned int set_size[IRQ_AFFINITY_MAX_SETS]; void (*calc_sets)(struct irq_affinity *, unsigned int nvecs); void *priv; }; /** * struct irq_affinity_desc - Interrupt affinity descriptor * @mask: cpumask to hold the affinity assignment * @is_managed: 1 if the interrupt is managed internally */ struct irq_affinity_desc { struct cpumask mask; unsigned int is_managed : 1; }; #if defined(CONFIG_SMP) extern cpumask_var_t irq_default_affinity; extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity); /** * irq_update_affinity_hint - Update the affinity hint * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint, but does not change the affinity of the interrupt. */ static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, false); } /** * irq_set_affinity_and_hint - Update the affinity hint and apply the provided * cpumask to the interrupt * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint and if @m is not NULL it applies it as the * affinity of that interrupt. */ static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, true); } /* * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint() * instead. */ static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return irq_set_affinity_and_hint(irq, m); } extern int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd); unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return 0; } static inline int irq_can_set_affinity(unsigned int irq) { return 0; } static inline int irq_select_affinity(unsigned int irq) { return 0; } static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { return -EINVAL; } static inline int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { return 0; } static inline struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd) { return NULL; } static inline unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd) { return maxvec; } #endif /* CONFIG_SMP */ /* * Special lockdep variants of irq disabling/enabling. * These should be used for locking constructs that * know that a particular irq context which is disabled, * and which is the only irq-context user of a lock, * that it's safe to take the lock in the irq-disabled * section without disabling hardirqs. * * On !CONFIG_LOCKDEP they are equivalent to the normal * irq disable/enable methods. */ static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_disable(); #endif } static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_save(*flags); #endif } static inline void enable_irq_lockdep(unsigned int irq) { #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_enable(); #endif enable_irq(irq); } static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_restore(*flags); #endif enable_irq(irq); } /* IRQ wakeup (PM) control: */ extern int irq_set_irq_wake(unsigned int irq, unsigned int on); static inline int enable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 1); } static inline int disable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 0); } /* * irq_get_irqchip_state/irq_set_irqchip_state specific flags */ enum irqchip_irq_state { IRQCHIP_STATE_PENDING, /* Is interrupt pending? */ IRQCHIP_STATE_ACTIVE, /* Is interrupt in progress? */ IRQCHIP_STATE_MASKED, /* Is interrupt masked? */ IRQCHIP_STATE_LINE_LEVEL, /* Is IRQ line high? */ }; extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state); extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool state); #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) # endif #else #define force_irqthreads() (false) #endif #ifndef local_softirq_pending #ifndef local_softirq_pending_ref #define local_softirq_pending_ref irq_stat.__softirq_pending #endif #define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref)) #define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x))) #define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x))) #endif /* local_softirq_pending */ /* Some architectures might implement lazy enabling/disabling of * interrupts. In some cases, such as stop_machine, we might want * to ensure that after a local_irq_disable(), interrupts have * really been disabled in hardware. Such architectures need to * implement the following hook. */ #ifndef hard_irq_disable #define hard_irq_disable() do { } while(0) #endif /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high frequency threaded job scheduling. For almost all the purposes tasklets are more than enough. F.e. all serial device BHs et al. should be converted to tasklets, not to softirqs. */ enum { HI_SOFTIRQ=0, TIMER_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; /* * The following vectors can be safely ignored after ksoftirqd is parked: * * _ RCU: * 1) rcutree_migrate_callbacks() migrates the queue. * 2) rcutree_report_cpu_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue * * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue */ #define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\ BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ)) /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. */ extern const char * const softirq_to_name[NR_SOFTIRQS]; /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. KAO */ struct softirq_action { void (*action)(void); }; asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); #ifdef CONFIG_PREEMPT_RT extern void do_softirq_post_smp_call_flush(unsigned int was_pending); #else static inline void do_softirq_post_smp_call_flush(unsigned int unused) { do_softirq(); } #endif extern void open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); /* * With forced-threaded interrupts enabled a raised softirq is deferred to * ksoftirqd unless it can be handled within the threaded interrupt. This * affects timer_list timers and hrtimers which are explicitly marked with * HRTIMER_MODE_SOFT. * With PREEMPT_RT enabled more hrtimers are moved to softirq for processing * which includes all timers which are not explicitly marked HRTIMER_MODE_HARD. * Userspace controlled timers (like the clock_nanosleep() interface) is divided * into two categories: Tasks with elevated scheduling policy including * SCHED_{FIFO|RR|DL} and the remaining scheduling policy. The tasks with the * elevated scheduling policy are woken up directly from the HARDIRQ while all * other wake ups are delayed to softirq and so to ksoftirqd. * * The ksoftirqd runs at SCHED_OTHER policy at which it should remain since it * handles the softirq in an overloaded situation (not handled everything * within its last run). * If the timers are handled at SCHED_OTHER priority then they competes with all * other SCHED_OTHER tasks for CPU resources are possibly delayed. * Moving timers softirqs to a low priority SCHED_FIFO thread instead ensures * that timer are performed before scheduling any SCHED_OTHER thread. */ DECLARE_PER_CPU(struct task_struct *, ktimerd); DECLARE_PER_CPU(unsigned long, pending_timer_softirq); void raise_ktimers_thread(unsigned int nr); static inline unsigned int local_timers_pending_force_th(void) { return __this_cpu_read(pending_timer_softirq); } static inline void raise_timer_softirq(unsigned int nr) { lockdep_assert_in_irq(); if (force_irqthreads()) raise_ktimers_thread(nr); else __raise_softirq_irqoff(nr); } static inline unsigned int local_timers_pending(void) { if (force_irqthreads()) return local_timers_pending_force_th(); else return local_softirq_pending(); } DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) { return this_cpu_read(ksoftirqd); } /* Tasklets --- multithreaded analogue of BHs. This API is deprecated. Please consider using threaded IRQs instead: https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de Main feature differing them of generic softirqs: tasklet is running only on one CPU simultaneously. Main feature differing them of BHs: different tasklets may be run simultaneously on different CPUs. Properties: * If tasklet_schedule() is called, then tasklet is guaranteed to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. * If this tasklet is already running on another CPU (or schedule is called from tasklet itself), it is rescheduled for later. * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. */ struct tasklet_struct { struct tasklet_struct *next; unsigned long state; atomic_t count; bool use_callback; union { void (*func)(unsigned long data); void (*callback)(struct tasklet_struct *t); }; unsigned long data; }; #define DECLARE_TASKLET(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .callback = _callback, \ .use_callback = true, \ } #define DECLARE_TASKLET_DISABLED(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .callback = _callback, \ .use_callback = true, \ } #define from_tasklet(var, callback_tasklet, tasklet_fieldname) \ container_of(callback_tasklet, typeof(*var), tasklet_fieldname) #define DECLARE_TASKLET_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .func = _func, \ } #define DECLARE_TASKLET_DISABLED_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .func = _func, \ } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ }; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } void tasklet_unlock(struct tasklet_struct *t); void tasklet_unlock_wait(struct tasklet_struct *t); void tasklet_unlock_spin_wait(struct tasklet_struct *t); #else static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } static inline void tasklet_unlock(struct tasklet_struct *t) { } static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } #endif extern void __tasklet_schedule(struct tasklet_struct *t); static inline void tasklet_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_schedule(t); } extern void __tasklet_hi_schedule(struct tasklet_struct *t); static inline void tasklet_hi_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_hi_schedule(t); } static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); smp_mb__after_atomic(); } /* * Do not use in new code. Disabling tasklets from atomic contexts is * error prone and should be avoided. */ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_spin_wait(t); smp_mb(); } static inline void tasklet_disable(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_wait(t); smp_mb(); } static inline void tasklet_enable(struct tasklet_struct *t) { smp_mb__before_atomic(); atomic_dec(&t->count); } extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); extern void tasklet_setup(struct tasklet_struct *t, void (*callback)(struct tasklet_struct *)); /* * Autoprobing for irqs: * * probe_irq_on() and probe_irq_off() provide robust primitives * for accurate IRQ probing during kernel initialization. They are * reasonably simple to use, are not "fooled" by spurious interrupts, * and, unlike other attempts at IRQ probing, they do not get hung on * stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards). * * For reasonably foolproof probing, use them as follows: * * 1. clear and/or mask the device's internal interrupt. * 2. sti(); * 3. irqs = probe_irq_on(); // "take over" all unassigned idle IRQs * 4. enable the device and cause it to trigger an interrupt. * 5. wait for the device to interrupt, using non-intrusive polling or a delay. * 6. irq = probe_irq_off(irqs); // get IRQ number, 0=none, negative=multiple * 7. service the device to clear its pending interrupt. * 8. loop again if paranoia is required. * * probe_irq_on() returns a mask of allocated irq's. * * probe_irq_off() takes the mask as a parameter, * and returns the irq number which occurred, * or zero if none occurred, or a negative irq number * if more than one irq occurred. */ #if !defined(CONFIG_GENERIC_IRQ_PROBE) static inline unsigned long probe_irq_on(void) { return 0; } static inline int probe_irq_off(unsigned long val) { return 0; } static inline unsigned int probe_irq_mask(unsigned long val) { return 0; } #else extern unsigned long probe_irq_on(void); /* returns 0 on failure */ extern int probe_irq_off(unsigned long); /* returns 0 or negative on failure */ extern unsigned int probe_irq_mask(unsigned long); /* returns mask of ISA interrupts */ #endif #ifdef CONFIG_PROC_FS /* Initialize /proc/irq/ */ extern void init_irq_proc(void); #else static inline void init_irq_proc(void) { } #endif #ifdef CONFIG_IRQ_TIMINGS void irq_timings_enable(void); void irq_timings_disable(void); u64 irq_timings_next_event(u64 now); #endif struct seq_file; int show_interrupts(struct seq_file *p, void *v); int arch_show_interrupts(struct seq_file *p, int prec); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); /* * We want to know which function is an entrypoint of a hardirq or a softirq. */ #ifndef __irq_entry # define __irq_entry __section(".irqentry.text") #endif #define __softirq_entry __section(".softirqentry.text") #endif
13 13 6 6 6 7 1 8 7 3 7 7 6 6 6 6 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 1 1 2 2 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 3 3 3 3 2 3 3 1 1 1 1 1 3 3 3 2 3 3 3 1 2 2 2 2 2 1 2 3 1 2 2 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 // SPDX-License-Identifier: GPL-2.0-only /* binder.c * * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. */ /* * Locking overview * * There are 3 main spinlocks which must be acquired in the * order shown: * * 1) proc->outer_lock : protects binder_ref * binder_proc_lock() and binder_proc_unlock() are * used to acq/rel. * 2) node->lock : protects most fields of binder_node. * binder_node_lock() and binder_node_unlock() are * used to acq/rel * 3) proc->inner_lock : protects the thread and node lists * (proc->threads, proc->waiting_threads, proc->nodes) * and all todo lists associated with the binder_proc * (proc->todo, thread->todo, proc->delivered_death and * node->async_todo), as well as thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * * Any lock under procA must never be nested under any lock at the same * level or below on procB. * * Functions that require a lock held on entry indicate which lock * in the suffix of the function name: * * foo_olocked() : requires node->outer_lock * foo_nlocked() : requires node->lock * foo_ilocked() : requires proc->inner_lock * foo_oilocked(): requires proc->outer_lock and proc->inner_lock * foo_nilocked(): requires node->lock and proc->inner_lock * ... */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fdtable.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/nsproxy.h> #include <linux/poll.h> #include <linux/debugfs.h> #include <linux/rbtree.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/pid_namespace.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/ratelimit.h> #include <linux/syscalls.h> #include <linux/task_work.h> #include <linux/sizes.h> #include <linux/ktime.h> #include <kunit/visibility.h> #include <uapi/linux/android/binder.h> #include <linux/cacheflush.h> #include "binder_netlink.h" #include "binder_internal.h" #include "binder_trace.h" static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); static HLIST_HEAD(binder_devices); static DEFINE_SPINLOCK(binder_devices_lock); static HLIST_HEAD(binder_procs); static DEFINE_MUTEX(binder_procs_lock); static HLIST_HEAD(binder_dead_nodes); static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; static atomic_t binder_last_id; static int proc_show(struct seq_file *m, void *unused); DEFINE_SHOW_ATTRIBUTE(proc); #define FORBIDDEN_MMAP_FLAGS (VM_WRITE) enum { BINDER_DEBUG_USER_ERROR = 1U << 0, BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, BINDER_DEBUG_DEAD_TRANSACTION = 1U << 2, BINDER_DEBUG_OPEN_CLOSE = 1U << 3, BINDER_DEBUG_DEAD_BINDER = 1U << 4, BINDER_DEBUG_DEATH_NOTIFICATION = 1U << 5, BINDER_DEBUG_READ_WRITE = 1U << 6, BINDER_DEBUG_USER_REFS = 1U << 7, BINDER_DEBUG_THREADS = 1U << 8, BINDER_DEBUG_TRANSACTION = 1U << 9, BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, BINDER_DEBUG_PRIORITY_CAP = 1U << 13, BINDER_DEBUG_SPINLOCKS = 1U << 14, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, 0644); char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; module_param_named(devices, binder_devices_param, charp, 0444); static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; static int binder_set_stop_on_user_error(const char *val, const struct kernel_param *kp) { int ret; ret = param_set_int(val, kp); if (binder_stop_on_user_error < 2) wake_up(&binder_user_error_wait); return ret; } module_param_call(stop_on_user_error, binder_set_stop_on_user_error, param_get_int, &binder_stop_on_user_error, 0644); static __printf(2, 3) void binder_debug(int mask, const char *format, ...) { struct va_format vaf; va_list args; if (binder_debug_mask & mask) { va_start(args, format); vaf.va = &args; vaf.fmt = format; pr_info_ratelimited("%pV", &vaf); va_end(args); } } #define binder_txn_error(x...) \ binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, x) static __printf(1, 2) void binder_user_error(const char *format, ...) { struct va_format vaf; va_list args; if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) { va_start(args, format); vaf.va = &args; vaf.fmt = format; pr_info_ratelimited("%pV", &vaf); va_end(args); } if (binder_stop_on_user_error) binder_stop_on_user_error = 2; } #define binder_set_extended_error(ee, _id, _command, _param) \ do { \ (ee)->id = _id; \ (ee)->command = _command; \ (ee)->param = _param; \ } while (0) #define to_flat_binder_object(hdr) \ container_of(hdr, struct flat_binder_object, hdr) #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_created[type]); } struct binder_transaction_log_entry { int debug_id; int debug_id_done; int call_type; int from_proc; int from_thread; int target_handle; int to_proc; int to_thread; int to_node; int data_size; int offsets_size; int return_error_line; uint32_t return_error; uint32_t return_error_param; char context_name[BINDERFS_MAX_NAME + 1]; }; struct binder_transaction_log { atomic_t cur; bool full; struct binder_transaction_log_entry entry[32]; }; static struct binder_transaction_log binder_transaction_log; static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { struct binder_transaction_log_entry *e; unsigned int cur = atomic_inc_return(&log->cur); if (cur >= ARRAY_SIZE(log->entry)) log->full = true; e = &log->entry[cur % ARRAY_SIZE(log->entry)]; WRITE_ONCE(e->debug_id_done, 0); /* * write-barrier to synchronize access to e->debug_id_done. * We make sure the initialized 0 value is seen before * memset() other fields are zeroed by memset. */ smp_wmb(); memset(e, 0, sizeof(*e)); return e; } enum binder_deferred_state { BINDER_DEFERRED_FLUSH = 0x01, BINDER_DEFERRED_RELEASE = 0x02, }; enum { BINDER_LOOPER_STATE_REGISTERED = 0x01, BINDER_LOOPER_STATE_ENTERED = 0x02, BINDER_LOOPER_STATE_EXITED = 0x04, BINDER_LOOPER_STATE_INVALID = 0x08, BINDER_LOOPER_STATE_WAITING = 0x10, BINDER_LOOPER_STATE_POLL = 0x20, }; /** * binder_proc_lock() - Acquire outer lock for given binder_proc * @proc: struct binder_proc to acquire * * Acquires proc->outer_lock. Used to protect binder_ref * structures associated with the given proc. */ #define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__) static void _binder_proc_lock(struct binder_proc *proc, int line) __acquires(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&proc->outer_lock); } /** * binder_proc_unlock() - Release outer lock for given binder_proc * @proc: struct binder_proc to acquire * * Release lock acquired via binder_proc_lock() */ #define binder_proc_unlock(proc) _binder_proc_unlock(proc, __LINE__) static void _binder_proc_unlock(struct binder_proc *proc, int line) __releases(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&proc->outer_lock); } /** * binder_inner_proc_lock() - Acquire inner lock for given binder_proc * @proc: struct binder_proc to acquire * * Acquires proc->inner_lock. Used to protect todo lists */ #define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__) static void _binder_inner_proc_lock(struct binder_proc *proc, int line) __acquires(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&proc->inner_lock); } /** * binder_inner_proc_unlock() - Release inner lock for given binder_proc * @proc: struct binder_proc to acquire * * Release lock acquired via binder_inner_proc_lock() */ #define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__) static void _binder_inner_proc_unlock(struct binder_proc *proc, int line) __releases(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&proc->inner_lock); } /** * binder_node_lock() - Acquire spinlock for given binder_node * @node: struct binder_node to acquire * * Acquires node->lock. Used to protect binder_node fields */ #define binder_node_lock(node) _binder_node_lock(node, __LINE__) static void _binder_node_lock(struct binder_node *node, int line) __acquires(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); } /** * binder_node_unlock() - Release spinlock for given binder_proc * @node: struct binder_node to acquire * * Release lock acquired via binder_node_lock() */ #define binder_node_unlock(node) _binder_node_unlock(node, __LINE__) static void _binder_node_unlock(struct binder_node *node, int line) __releases(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&node->lock); } /** * binder_node_inner_lock() - Acquire node and inner locks * @node: struct binder_node to acquire * * Acquires node->lock. If node->proc also acquires * proc->inner_lock. Used to protect binder_node fields */ #define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) static void _binder_node_inner_lock(struct binder_node *node, int line) __acquires(&node->lock) __acquires(&node->proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); if (node->proc) binder_inner_proc_lock(node->proc); else /* annotation for sparse */ __acquire(&node->proc->inner_lock); } /** * binder_node_inner_unlock() - Release node and inner locks * @node: struct binder_node to acquire * * Release lock acquired via binder_node_lock() */ #define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) static void _binder_node_inner_unlock(struct binder_node *node, int line) __releases(&node->lock) __releases(&node->proc->inner_lock) { struct binder_proc *proc = node->proc; binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); if (proc) binder_inner_proc_unlock(proc); else /* annotation for sparse */ __release(&node->proc->inner_lock); spin_unlock(&node->lock); } static bool binder_worklist_empty_ilocked(struct list_head *list) { return list_empty(list); } /** * binder_worklist_empty() - Check if no items on the work list * @proc: binder_proc associated with list * @list: list to check * * Return: true if there are no items on list, else false */ static bool binder_worklist_empty(struct binder_proc *proc, struct list_head *list) { bool ret; binder_inner_proc_lock(proc); ret = binder_worklist_empty_ilocked(list); binder_inner_proc_unlock(proc); return ret; } /** * binder_enqueue_work_ilocked() - Add an item to the work list * @work: struct binder_work to add to list * @target_list: list to add work to * * Adds the work to the specified list. Asserts that work * is not already on a list. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_work_ilocked(struct binder_work *work, struct list_head *target_list) { BUG_ON(target_list == NULL); BUG_ON(work->entry.next && !list_empty(&work->entry)); list_add_tail(&work->entry, target_list); } /** * binder_enqueue_deferred_thread_work_ilocked() - Add deferred thread work * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread. Doesn't set the process_todo * flag, which means that (if it wasn't already set) the thread will go to * sleep without handling this work when it calls read. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); } /** * binder_enqueue_thread_work_ilocked() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); /* (e)poll-based threads require an explicit wakeup signal when * queuing their own work; they rely on these events to consume * messages without I/O block. Without it, threads risk waiting * indefinitely without handling the work. */ if (thread->looper & BINDER_LOOPER_STATE_POLL && thread->pid == current->pid && !thread->process_todo) wake_up_interruptible_sync(&thread->wait); thread->process_todo = true; } /** * binder_enqueue_thread_work() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. */ static void binder_enqueue_thread_work(struct binder_thread *thread, struct binder_work *work) { binder_inner_proc_lock(thread->proc); binder_enqueue_thread_work_ilocked(thread, work); binder_inner_proc_unlock(thread->proc); } static void binder_dequeue_work_ilocked(struct binder_work *work) { list_del_init(&work->entry); } /** * binder_dequeue_work() - Removes an item from the work list * @proc: binder_proc associated with list * @work: struct binder_work to remove from list * * Removes the specified work item from whatever list it is on. * Can safely be called if work is not on any list. */ static void binder_dequeue_work(struct binder_proc *proc, struct binder_work *work) { binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(work); binder_inner_proc_unlock(proc); } static struct binder_work *binder_dequeue_work_head_ilocked( struct list_head *list) { struct binder_work *w; w = list_first_entry_or_null(list, struct binder_work, entry); if (w) list_del_init(&w->entry); return w; } static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); static void binder_inc_node_tmpref_ilocked(struct binder_node *node); static bool binder_has_work_ilocked(struct binder_thread *thread, bool do_proc_work) { return thread->process_todo || thread->looper_need_return || (do_proc_work && !binder_worklist_empty_ilocked(&thread->proc->todo)); } static bool binder_has_work(struct binder_thread *thread, bool do_proc_work) { bool has_work; binder_inner_proc_lock(thread->proc); has_work = binder_has_work_ilocked(thread, do_proc_work); binder_inner_proc_unlock(thread->proc); return has_work; } static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread) { return !thread->transaction_stack && binder_worklist_empty_ilocked(&thread->todo); } static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, bool sync) { struct rb_node *n; struct binder_thread *thread; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { thread = rb_entry(n, struct binder_thread, rb_node); if (thread->looper & BINDER_LOOPER_STATE_POLL && binder_available_for_proc_work_ilocked(thread)) { if (sync) wake_up_interruptible_sync(&thread->wait); else wake_up_interruptible(&thread->wait); } } } /** * binder_select_thread_ilocked() - selects a thread for doing proc work. * @proc: process to select a thread from * * Note that calling this function moves the thread off the waiting_threads * list, so it can only be woken up by the caller of this function, or a * signal. Therefore, callers *should* always wake up the thread this function * returns. * * Return: If there's a thread currently waiting for process work, * returns that thread. Otherwise returns NULL. */ static struct binder_thread * binder_select_thread_ilocked(struct binder_proc *proc) { struct binder_thread *thread; assert_spin_locked(&proc->inner_lock); thread = list_first_entry_or_null(&proc->waiting_threads, struct binder_thread, waiting_thread_node); if (thread) list_del_init(&thread->waiting_thread_node); return thread; } /** * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work. * @proc: process to wake up a thread in * @thread: specific thread to wake-up (may be NULL) * @sync: whether to do a synchronous wake-up * * This function wakes up a thread in the @proc process. * The caller may provide a specific thread to wake-up in * the @thread parameter. If @thread is NULL, this function * will wake up threads that have called poll(). * * Note that for this function to work as expected, callers * should first call binder_select_thread() to find a thread * to handle the work (if they don't have a thread already), * and pass the result into the @thread parameter. */ static void binder_wakeup_thread_ilocked(struct binder_proc *proc, struct binder_thread *thread, bool sync) { assert_spin_locked(&proc->inner_lock); if (thread) { if (sync) wake_up_interruptible_sync(&thread->wait); else wake_up_interruptible(&thread->wait); return; } /* Didn't find a thread waiting for proc work; this can happen * in two scenarios: * 1. All threads are busy handling transactions * In that case, one of those threads should call back into * the kernel driver soon and pick up this work. * 2. Threads are using the (e)poll interface, in which case * they may be blocked on the waitqueue without having been * added to waiting_threads. For this case, we just iterate * over all threads not handling transaction work, and * wake them all up. We wake all because we don't know whether * a thread that called into (e)poll is handling non-binder * work currently. */ binder_wakeup_poll_threads_ilocked(proc, sync); } static void binder_wakeup_proc_ilocked(struct binder_proc *proc) { struct binder_thread *thread = binder_select_thread_ilocked(proc); binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); } static void binder_set_nice(long nice) { long min_nice; if (can_nice(current, nice)) { set_user_nice(current, nice); return; } min_nice = rlimit_to_nice(rlimit(RLIMIT_NICE)); binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); set_user_nice(current, min_nice); if (min_nice <= MAX_NICE) return; binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, binder_uintptr_t ptr) { struct rb_node *n = proc->nodes.rb_node; struct binder_node *node; assert_spin_locked(&proc->inner_lock); while (n) { node = rb_entry(n, struct binder_node, rb_node); if (ptr < node->ptr) n = n->rb_left; else if (ptr > node->ptr) n = n->rb_right; else { /* * take an implicit weak reference * to ensure node stays alive until * call to binder_put_node() */ binder_inc_node_tmpref_ilocked(node); return node; } } return NULL; } static struct binder_node *binder_get_node(struct binder_proc *proc, binder_uintptr_t ptr) { struct binder_node *node; binder_inner_proc_lock(proc); node = binder_get_node_ilocked(proc, ptr); binder_inner_proc_unlock(proc); return node; } static struct binder_node *binder_init_node_ilocked( struct binder_proc *proc, struct binder_node *new_node, struct flat_binder_object *fp) { struct rb_node **p = &proc->nodes.rb_node; struct rb_node *parent = NULL; struct binder_node *node; binder_uintptr_t ptr = fp ? fp->binder : 0; binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; assert_spin_locked(&proc->inner_lock); while (*p) { parent = *p; node = rb_entry(parent, struct binder_node, rb_node); if (ptr < node->ptr) p = &(*p)->rb_left; else if (ptr > node->ptr) p = &(*p)->rb_right; else { /* * A matching node is already in * the rb tree. Abandon the init * and return it. */ binder_inc_node_tmpref_ilocked(node); return node; } } node = new_node; binder_stats_created(BINDER_STAT_NODE); node->tmp_refs++; rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); node->debug_id = atomic_inc_return(&binder_last_id); node->proc = proc; node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx created\n", proc->pid, current->pid, node->debug_id, (u64)node->ptr, (u64)node->cookie); return node; } static struct binder_node *binder_new_node(struct binder_proc *proc, struct flat_binder_object *fp) { struct binder_node *node; struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL); if (!new_node) return NULL; binder_inner_proc_lock(proc); node = binder_init_node_ilocked(proc, new_node, fp); binder_inner_proc_unlock(proc); if (node != new_node) /* * The node was already added by another thread */ kfree(new_node); return node; } static void binder_free_node(struct binder_node *node) { kfree(node); binder_stats_deleted(BINDER_STAT_NODE); } static int binder_inc_node_nilocked(struct binder_node *node, int strong, int internal, struct list_head *target_list) { struct binder_proc *proc = node->proc; assert_spin_locked(&node->lock); if (proc) assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) { if (target_list == NULL && node->internal_strong_refs == 0 && !(node->proc && node == node->proc->context->binder_context_mgr_node && node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); return -EINVAL; } node->internal_strong_refs++; } else node->local_strong_refs++; if (!node->has_strong_ref && target_list) { struct binder_thread *thread = container_of(target_list, struct binder_thread, todo); binder_dequeue_work_ilocked(&node->work); BUG_ON(&thread->todo != target_list); binder_enqueue_deferred_thread_work_ilocked(thread, &node->work); } } else { if (!internal) node->local_weak_refs++; if (!node->has_weak_ref && target_list && list_empty(&node->work.entry)) binder_enqueue_work_ilocked(&node->work, target_list); } return 0; } static int binder_inc_node(struct binder_node *node, int strong, int internal, struct list_head *target_list) { int ret; binder_node_inner_lock(node); ret = binder_inc_node_nilocked(node, strong, internal, target_list); binder_node_inner_unlock(node); return ret; } static bool binder_dec_node_nilocked(struct binder_node *node, int strong, int internal) { struct binder_proc *proc = node->proc; assert_spin_locked(&node->lock); if (proc) assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) node->internal_strong_refs--; else node->local_strong_refs--; if (node->local_strong_refs || node->internal_strong_refs) return false; } else { if (!internal) node->local_weak_refs--; if (node->local_weak_refs || node->tmp_refs || !hlist_empty(&node->refs)) return false; } if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { binder_enqueue_work_ilocked(&node->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && !node->local_weak_refs && !node->tmp_refs) { if (proc) { binder_dequeue_work_ilocked(&node->work); rb_erase(&node->rb_node, &proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "refless node %d deleted\n", node->debug_id); } else { BUG_ON(!list_empty(&node->work.entry)); spin_lock(&binder_dead_nodes_lock); /* * tmp_refs could have changed so * check it again */ if (node->tmp_refs) { spin_unlock(&binder_dead_nodes_lock); return false; } hlist_del(&node->dead_node); spin_unlock(&binder_dead_nodes_lock); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "dead node %d deleted\n", node->debug_id); } return true; } } return false; } static void binder_dec_node(struct binder_node *node, int strong, int internal) { bool free_node; binder_node_inner_lock(node); free_node = binder_dec_node_nilocked(node, strong, internal); binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } static void binder_inc_node_tmpref_ilocked(struct binder_node *node) { /* * No call to binder_inc_node() is needed since we * don't need to inform userspace of any changes to * tmp_refs */ node->tmp_refs++; } /** * binder_inc_node_tmpref() - take a temporary reference on node * @node: node to reference * * Take reference on node to prevent the node from being freed * while referenced only by a local variable. The inner lock is * needed to serialize with the node work on the queue (which * isn't needed after the node is dead). If the node is dead * (node->proc is NULL), use binder_dead_nodes_lock to protect * node->tmp_refs against dead-node-only cases where the node * lock cannot be acquired (eg traversing the dead node list to * print nodes) */ static void binder_inc_node_tmpref(struct binder_node *node) { binder_node_lock(node); if (node->proc) binder_inner_proc_lock(node->proc); else spin_lock(&binder_dead_nodes_lock); binder_inc_node_tmpref_ilocked(node); if (node->proc) binder_inner_proc_unlock(node->proc); else spin_unlock(&binder_dead_nodes_lock); binder_node_unlock(node); } /** * binder_dec_node_tmpref() - remove a temporary reference on node * @node: node to reference * * Release temporary reference on node taken via binder_inc_node_tmpref() */ static void binder_dec_node_tmpref(struct binder_node *node) { bool free_node; binder_node_inner_lock(node); if (!node->proc) spin_lock(&binder_dead_nodes_lock); else __acquire(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); if (!node->proc) spin_unlock(&binder_dead_nodes_lock); else __release(&binder_dead_nodes_lock); /* * Call binder_dec_node() to check if all refcounts are 0 * and cleanup is needed. Calling with strong=0 and internal=1 * causes no actual reference to be released in binder_dec_node(). * If that changes, a change is needed here too. */ free_node = binder_dec_node_nilocked(node, 0, 1); binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } static void binder_put_node(struct binder_node *node) { binder_dec_node_tmpref(node); } static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc, u32 desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (desc < ref->data.desc) { n = n->rb_left; } else if (desc > ref->data.desc) { n = n->rb_right; } else if (need_strong_ref && !ref->data.strong) { binder_user_error("tried to use weak ref as strong ref\n"); return NULL; } else { return ref; } } return NULL; } /* Find the smallest unused descriptor the "slow way" */ static u32 slow_desc_lookup_olocked(struct binder_proc *proc, u32 offset) { struct binder_ref *ref; struct rb_node *n; u32 desc; desc = offset; for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (ref->data.desc > desc) break; desc = ref->data.desc + 1; } return desc; } /* * Find an available reference descriptor ID. The proc->outer_lock might * be released in the process, in which case -EAGAIN is returned and the * @desc should be considered invalid. */ static int get_ref_desc_olocked(struct binder_proc *proc, struct binder_node *node, u32 *desc) { struct dbitmap *dmap = &proc->dmap; unsigned int nbits, offset; unsigned long *new, bit; /* 0 is reserved for the context manager */ offset = (node == proc->context->binder_context_mgr_node) ? 0 : 1; if (!dbitmap_enabled(dmap)) { *desc = slow_desc_lookup_olocked(proc, offset); return 0; } if (dbitmap_acquire_next_zero_bit(dmap, offset, &bit) == 0) { *desc = bit; return 0; } /* * The dbitmap is full and needs to grow. The proc->outer_lock * is briefly released to allocate the new bitmap safely. */ nbits = dbitmap_grow_nbits(dmap); binder_proc_unlock(proc); new = bitmap_zalloc(nbits, GFP_KERNEL); binder_proc_lock(proc); dbitmap_grow(dmap, new, nbits); return -EAGAIN; } /** * binder_get_ref_for_node_olocked() - get the ref associated with given node * @proc: binder_proc that owns the ref * @node: binder_node of target * @new_ref: newly allocated binder_ref to be initialized or %NULL * * Look up the ref for the given node and return it if it exists * * If it doesn't exist and the caller provides a newly allocated * ref, initialize the fields of the newly allocated ref and insert * into the given proc rb_trees and node refs list. * * Return: the ref for node. It is possible that another thread * allocated/initialized the ref first in which case the * returned ref would be different than the passed-in * new_ref. new_ref must be kfree'd by the caller in * this case. */ static struct binder_ref *binder_get_ref_for_node_olocked( struct binder_proc *proc, struct binder_node *node, struct binder_ref *new_ref) { struct binder_ref *ref; struct rb_node *parent; struct rb_node **p; u32 desc; retry: p = &proc->refs_by_node.rb_node; parent = NULL; while (*p) { parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_node); if (node < ref->node) p = &(*p)->rb_left; else if (node > ref->node) p = &(*p)->rb_right; else return ref; } if (!new_ref) return NULL; /* might release the proc->outer_lock */ if (get_ref_desc_olocked(proc, node, &desc) == -EAGAIN) goto retry; binder_stats_created(BINDER_STAT_REF); new_ref->data.debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); new_ref->data.desc = desc; p = &proc->refs_by_desc.rb_node; while (*p) { parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_desc); if (new_ref->data.desc < ref->data.desc) p = &(*p)->rb_left; else if (new_ref->data.desc > ref->data.desc) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new_ref->rb_node_desc, parent, p); rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); binder_node_lock(node); hlist_add_head(&new_ref->node_entry, &node->refs); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d new ref %d desc %d for node %d\n", proc->pid, new_ref->data.debug_id, new_ref->data.desc, node->debug_id); binder_node_unlock(node); return new_ref; } static void binder_cleanup_ref_olocked(struct binder_ref *ref) { struct dbitmap *dmap = &ref->proc->dmap; bool delete_node = false; binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->node->debug_id); if (dbitmap_enabled(dmap)) dbitmap_clear_bit(dmap, ref->data.desc); rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); binder_node_inner_lock(ref->node); if (ref->data.strong) binder_dec_node_nilocked(ref->node, 1, 1); hlist_del(&ref->node_entry); delete_node = binder_dec_node_nilocked(ref->node, 0, 1); binder_node_inner_unlock(ref->node); /* * Clear ref->node unless we want the caller to free the node */ if (!delete_node) { /* * The caller uses ref->node to determine * whether the node needs to be freed. Clear * it since the node is still alive. */ ref->node = NULL; } if (ref->death) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d delete ref %d desc %d has death notification\n", ref->proc->pid, ref->data.debug_id, ref->data.desc); binder_dequeue_work(ref->proc, &ref->death->work); binder_stats_deleted(BINDER_STAT_DEATH); } if (ref->freeze) { binder_dequeue_work(ref->proc, &ref->freeze->work); binder_stats_deleted(BINDER_STAT_FREEZE); } binder_stats_deleted(BINDER_STAT_REF); } /** * binder_inc_ref_olocked() - increment the ref for given handle * @ref: ref to be incremented * @strong: if true, strong increment, else weak * @target_list: list to queue node work on * * Increment the ref. @ref->proc->outer_lock must be held on entry * * Return: 0, if successful, else errno */ static int binder_inc_ref_olocked(struct binder_ref *ref, int strong, struct list_head *target_list) { int ret; if (strong) { if (ref->data.strong == 0) { ret = binder_inc_node(ref->node, 1, 1, target_list); if (ret) return ret; } ref->data.strong++; } else { if (ref->data.weak == 0) { ret = binder_inc_node(ref->node, 0, 1, target_list); if (ret) return ret; } ref->data.weak++; } return 0; } /** * binder_dec_ref_olocked() - dec the ref for given handle * @ref: ref to be decremented * @strong: if true, strong decrement, else weak * * Decrement the ref. * * Return: %true if ref is cleaned up and ready to be freed. */ static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong) { if (strong) { if (ref->data.strong == 0) { binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.strong--; if (ref->data.strong == 0) binder_dec_node(ref->node, strong, 1); } else { if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.weak--; } if (ref->data.strong == 0 && ref->data.weak == 0) { binder_cleanup_ref_olocked(ref); return true; } return false; } /** * binder_get_node_from_ref() - get the node from the given proc/desc * @proc: proc containing the ref * @desc: the handle associated with the ref * @need_strong_ref: if true, only return node if ref is strong * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, return the associated binder_node * * Return: a binder_node or NULL if not found or not strong when strong required */ static struct binder_node *binder_get_node_from_ref( struct binder_proc *proc, u32 desc, bool need_strong_ref, struct binder_ref_data *rdata) { struct binder_node *node; struct binder_ref *ref; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, need_strong_ref); if (!ref) goto err_no_ref; node = ref->node; /* * Take an implicit reference on the node to ensure * it stays alive until the call to binder_put_node() */ binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); return node; err_no_ref: binder_proc_unlock(proc); return NULL; } /** * binder_free_ref() - free the binder_ref * @ref: ref to free * * Free the binder_ref. Free the binder_node indicated by ref->node * (if non-NULL) and the binder_ref_death indicated by ref->death. */ static void binder_free_ref(struct binder_ref *ref) { if (ref->node) binder_free_node(ref->node); kfree(ref->death); kfree(ref->freeze); kfree(ref); } /* shrink descriptor bitmap if needed */ static void try_shrink_dmap(struct binder_proc *proc) { unsigned long *new; int nbits; binder_proc_lock(proc); nbits = dbitmap_shrink_nbits(&proc->dmap); binder_proc_unlock(proc); if (!nbits) return; new = bitmap_zalloc(nbits, GFP_KERNEL); binder_proc_lock(proc); dbitmap_shrink(&proc->dmap, new, nbits); binder_proc_unlock(proc); } /** * binder_update_ref_for_handle() - inc/dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @increment: true=inc reference, false=dec reference * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, increment or decrement the ref * according to "increment" arg. * * Return: 0 if successful, else errno */ static int binder_update_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool increment, bool strong, struct binder_ref_data *rdata) { int ret = 0; struct binder_ref *ref; bool delete_ref = false; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, strong); if (!ref) { ret = -EINVAL; goto err_no_ref; } if (increment) ret = binder_inc_ref_olocked(ref, strong, NULL); else delete_ref = binder_dec_ref_olocked(ref, strong); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); if (delete_ref) { binder_free_ref(ref); try_shrink_dmap(proc); } return ret; err_no_ref: binder_proc_unlock(proc); return ret; } /** * binder_dec_ref_for_handle() - dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Just calls binder_update_ref_for_handle() to decrement the ref. * * Return: 0 if successful, else errno */ static int binder_dec_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool strong, struct binder_ref_data *rdata) { return binder_update_ref_for_handle(proc, desc, false, strong, rdata); } /** * binder_inc_ref_for_node() - increment the ref for given proc/node * @proc: proc containing the ref * @node: target node * @strong: true=strong reference, false=weak reference * @target_list: worklist to use if node is incremented * @rdata: the id/refcount data for the ref * * Given a proc and node, increment the ref. Create the ref if it * doesn't already exist * * Return: 0 if successful, else errno */ static int binder_inc_ref_for_node(struct binder_proc *proc, struct binder_node *node, bool strong, struct list_head *target_list, struct binder_ref_data *rdata) { struct binder_ref *ref; struct binder_ref *new_ref = NULL; int ret = 0; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { binder_proc_unlock(proc); new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, new_ref); } ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; if (ret && ref == new_ref) { /* * Cleanup the failed reference here as the target * could now be dead and have already released its * references by now. Calling on the new reference * with strong=0 and a tmp_refs will not decrement * the node. The new_ref gets kfree'd below. */ binder_cleanup_ref_olocked(new_ref); ref = NULL; } binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* * Another thread created the ref first so * free the one we allocated */ kfree(new_ref); return ret; } static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, struct binder_transaction *t) { BUG_ON(!target_thread); assert_spin_locked(&target_thread->proc->inner_lock); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = target_thread->transaction_stack->from_parent; t->from = NULL; } /** * binder_thread_dec_tmpref() - decrement thread->tmp_ref * @thread: thread to decrement * * A thread needs to be kept alive while being used to create or * handle a transaction. binder_get_txn_from() is used to safely * extract t->from from a binder_transaction and keep the thread * indicated by t->from from being freed. When done with that * binder_thread, this function is called to decrement the * tmp_ref and free if appropriate (thread has been released * and no transaction being processed by the driver) */ static void binder_thread_dec_tmpref(struct binder_thread *thread) { /* * atomic is used to protect the counter value while * it cannot reach zero or thread->is_dead is false */ binder_inner_proc_lock(thread->proc); atomic_dec(&thread->tmp_ref); if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { binder_inner_proc_unlock(thread->proc); binder_free_thread(thread); return; } binder_inner_proc_unlock(thread->proc); } /** * binder_proc_dec_tmpref() - decrement proc->tmp_ref * @proc: proc to decrement * * A binder_proc needs to be kept alive while being used to create or * handle a transaction. proc->tmp_ref is incremented when * creating a new transaction or the binder_proc is currently in-use * by threads that are being released. When done with the binder_proc, * this function is called to decrement the counter and free the * proc if appropriate (proc has been released, all threads have * been released and not currently in-use to process a transaction). */ static void binder_proc_dec_tmpref(struct binder_proc *proc) { binder_inner_proc_lock(proc); proc->tmp_ref--; if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && !proc->tmp_ref) { binder_inner_proc_unlock(proc); binder_free_proc(proc); return; } binder_inner_proc_unlock(proc); } /** * binder_get_txn_from() - safely extract the "from" thread in transaction * @t: binder transaction for t->from * * Atomically return the "from" thread and increment the tmp_ref * count for the thread to ensure it stays alive until * binder_thread_dec_tmpref() is called. * * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from( struct binder_transaction *t) { struct binder_thread *from; guard(spinlock)(&t->lock); from = t->from; if (from) atomic_inc(&from->tmp_ref); return from; } /** * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock * @t: binder transaction for t->from * * Same as binder_get_txn_from() except it also acquires the proc->inner_lock * to guarantee that the thread cannot be released while operating on it. * The caller must call binder_inner_proc_unlock() to release the inner lock * as well as call binder_dec_thread_txn() to release the reference. * * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from_and_acq_inner( struct binder_transaction *t) __acquires(&t->from->proc->inner_lock) { struct binder_thread *from; from = binder_get_txn_from(t); if (!from) { __acquire(&from->proc->inner_lock); return NULL; } binder_inner_proc_lock(from->proc); if (t->from) { BUG_ON(from != t->from); return from; } binder_inner_proc_unlock(from->proc); __acquire(&from->proc->inner_lock); binder_thread_dec_tmpref(from); return NULL; } /** * binder_free_txn_fixups() - free unprocessed fd fixups * @t: binder transaction for t->from * * If the transaction is being torn down prior to being * processed by the target process, free all of the * fd fixups and fput the file structs. It is safe to * call this function after the fixups have been * processed -- in that case, the list will be empty. */ static void binder_free_txn_fixups(struct binder_transaction *t) { struct binder_txn_fd_fixup *fixup, *tmp; list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { fput(fixup->file); if (fixup->target_fd >= 0) put_unused_fd(fixup->target_fd); list_del(&fixup->fixup_entry); kfree(fixup); } } static void binder_txn_latency_free(struct binder_transaction *t) { int from_proc, from_thread, to_proc, to_thread; spin_lock(&t->lock); from_proc = t->from ? t->from->proc->pid : 0; from_thread = t->from ? t->from->pid : 0; to_proc = t->to_proc ? t->to_proc->pid : 0; to_thread = t->to_thread ? t->to_thread->pid : 0; spin_unlock(&t->lock); trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread); } static void binder_free_transaction(struct binder_transaction *t) { struct binder_proc *target_proc = t->to_proc; if (target_proc) { binder_inner_proc_lock(target_proc); target_proc->outstanding_txns--; if (target_proc->outstanding_txns < 0) pr_warn("%s: Unexpected outstanding_txns %d\n", __func__, target_proc->outstanding_txns); if (!target_proc->outstanding_txns && target_proc->is_frozen) wake_up_interruptible_all(&target_proc->freeze_wait); if (t->buffer) t->buffer->transaction = NULL; binder_inner_proc_unlock(target_proc); } if (trace_binder_txn_latency_free_enabled()) binder_txn_latency_free(t); /* * If the transaction has no target_proc, then * t->buffer->transaction has already been cleared. */ binder_free_txn_fixups(t); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); } static void binder_send_failed_reply(struct binder_transaction *t, uint32_t error_code) { struct binder_thread *target_thread; struct binder_transaction *next; BUG_ON(t->flags & TF_ONE_WAY); while (1) { target_thread = binder_get_txn_from_and_acq_inner(t); if (target_thread) { binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d to %d:%d\n", t->debug_id, target_thread->proc->pid, target_thread->pid); binder_pop_transaction_ilocked(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; binder_enqueue_thread_work_ilocked( target_thread, &target_thread->reply_error.work); wake_up_interruptible(&target_thread->wait); } else { /* * Cannot get here for normal operation, but * we can if multiple synchronous transactions * are sent without blocking for responses. * Just ignore the 2nd error in this case. */ pr_warn("Unexpected reply error: %u\n", target_thread->reply_error.cmd); } binder_inner_proc_unlock(target_thread->proc); binder_thread_dec_tmpref(target_thread); binder_free_transaction(t); return; } __release(&target_thread->proc->inner_lock); next = t->from_parent; binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d, target dead\n", t->debug_id); binder_free_transaction(t); if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread at root\n"); return; } t = next; binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread -- retry %d\n", t->debug_id); } } /** * binder_cleanup_transaction() - cleans up undelivered transaction * @t: transaction that needs to be cleaned up * @reason: reason the transaction wasn't delivered * @error_code: error to return to caller (if synchronous call) */ static void binder_cleanup_transaction(struct binder_transaction *t, const char *reason, uint32_t error_code) { if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) { binder_send_failed_reply(t, error_code); } else { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered transaction %d, %s\n", t->debug_id, reason); binder_free_transaction(t); } } /** * binder_get_object() - gets object and checks for valid metadata * @proc: binder_proc owning the buffer * @u: sender's user pointer to base of buffer * @buffer: binder_buffer that we're parsing. * @offset: offset in the @buffer at which to validate an object. * @object: struct binder_object to read into * * Copy the binder object at the given offset into @object. If @u is * provided then the copy is from the sender's buffer. If not, then * it is copied from the target's @buffer. * * Return: If there's a valid metadata object at @offset, the * size of that object. Otherwise, it returns zero. The object * is read into the struct binder_object pointed to by @object. */ static size_t binder_get_object(struct binder_proc *proc, const void __user *u, struct binder_buffer *buffer, unsigned long offset, struct binder_object *object) { size_t read_size; struct binder_object_header *hdr; size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); if (offset > buffer->data_size || read_size < sizeof(*hdr) || !IS_ALIGNED(offset, sizeof(u32))) return 0; if (u) { if (copy_from_user(object, u + offset, read_size)) return 0; } else { if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, offset, read_size)) return 0; } /* Ok, now see if we read a complete object. */ hdr = &object->hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: object_size = sizeof(struct flat_binder_object); break; case BINDER_TYPE_FD: object_size = sizeof(struct binder_fd_object); break; case BINDER_TYPE_PTR: object_size = sizeof(struct binder_buffer_object); break; case BINDER_TYPE_FDA: object_size = sizeof(struct binder_fd_array_object); break; default: return 0; } if (offset <= buffer->data_size - object_size && buffer->data_size >= object_size) return object_size; else return 0; } /** * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. * @proc: binder_proc owning the buffer * @b: binder_buffer containing the object * @object: struct binder_object to read into * @index: index in offset array at which the binder_buffer_object is * located * @start_offset: points to the start of the offset array * @object_offsetp: offset of @object read from @b * @num_valid: the number of valid offsets in the offset array * * Return: If @index is within the valid range of the offset array * described by @start and @num_valid, and if there's a valid * binder_buffer_object at the offset found in index @index * of the offset array, that object is returned. Otherwise, * %NULL is returned. * Note that the offset found in index @index itself is not * verified; this function assumes that @num_valid elements * from @start were previously verified to have valid offsets. * If @object_offsetp is non-NULL, then the offset within * @b is written to it. */ static struct binder_buffer_object *binder_validate_ptr( struct binder_proc *proc, struct binder_buffer *b, struct binder_object *object, binder_size_t index, binder_size_t start_offset, binder_size_t *object_offsetp, binder_size_t num_valid) { size_t object_size; binder_size_t object_offset; unsigned long buffer_offset; if (index >= num_valid) return NULL; buffer_offset = start_offset + sizeof(binder_size_t) * index; if (binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, b, buffer_offset, sizeof(object_offset))) return NULL; object_size = binder_get_object(proc, NULL, b, object_offset, object); if (!object_size || object->hdr.type != BINDER_TYPE_PTR) return NULL; if (object_offsetp) *object_offsetp = object_offset; return &object->bbo; } /** * binder_validate_fixup() - validates pointer/fd fixups happen in order. * @proc: binder_proc owning the buffer * @b: transaction buffer * @objects_start_offset: offset to start of objects buffer * @buffer_obj_offset: offset to binder_buffer_object in which to fix up * @fixup_offset: start offset in @buffer to fix up * @last_obj_offset: offset to last binder_buffer_object that we fixed * @last_min_offset: minimum fixup offset in object at @last_obj_offset * * Return: %true if a fixup in buffer @buffer at offset @offset is * allowed. * * For safety reasons, we only allow fixups inside a buffer to happen * at increasing offsets; additionally, we only allow fixup on the last * buffer object that was verified, or one of its parents. * * Example of what is allowed: * * A * B (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = C, offset = 0) * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) * * Examples of what is not allowed: * * Decreasing offsets within the same parent: * A * C (parent = A, offset = 16) * B (parent = A, offset = 0) // decreasing offset within A * * Referring to a parent that wasn't the last object or any of its parents: * A * B (parent = A, offset = 0) * C (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = B, offset = 0) // B is not A or any of A's parents */ static bool binder_validate_fixup(struct binder_proc *proc, struct binder_buffer *b, binder_size_t objects_start_offset, binder_size_t buffer_obj_offset, binder_size_t fixup_offset, binder_size_t last_obj_offset, binder_size_t last_min_offset) { if (!last_obj_offset) { /* Nothing to fix up in */ return false; } while (last_obj_offset != buffer_obj_offset) { unsigned long buffer_offset; struct binder_object last_object; struct binder_buffer_object *last_bbo; size_t object_size = binder_get_object(proc, NULL, b, last_obj_offset, &last_object); if (object_size != sizeof(*last_bbo)) return false; last_bbo = &last_object.bbo; /* * Safe to retrieve the parent of last_obj, since it * was already previously verified by the driver. */ if ((last_bbo->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) return false; last_min_offset = last_bbo->parent_offset + sizeof(uintptr_t); buffer_offset = objects_start_offset + sizeof(binder_size_t) * last_bbo->parent; if (binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset, b, buffer_offset, sizeof(last_obj_offset))) return false; } return (fixup_offset >= last_min_offset); } /** * struct binder_task_work_cb - for deferred close * * @twork: callback_head for task work * @file: file to close * * Structure to pass task work to be handled after * returning from binder_ioctl() via task_work_add(). */ struct binder_task_work_cb { struct callback_head twork; struct file *file; }; /** * binder_do_fd_close() - close list of file descriptors * @twork: callback head for task work * * It is not safe to call ksys_close() during the binder_ioctl() * function if there is a chance that binder's own file descriptor * might be closed. This is to meet the requirements for using * fdget() (see comments for __fget_light()). Therefore use * task_work_add() to schedule the close operation once we have * returned from binder_ioctl(). This function is a callback * for that mechanism and does the actual ksys_close() on the * given file descriptor. */ static void binder_do_fd_close(struct callback_head *twork) { struct binder_task_work_cb *twcb = container_of(twork, struct binder_task_work_cb, twork); fput(twcb->file); kfree(twcb); } /** * binder_deferred_fd_close() - schedule a close for the given file-descriptor * @fd: file-descriptor to close * * See comments in binder_do_fd_close(). This function is used to schedule * a file-descriptor to be closed after returning from binder_ioctl(). */ static void binder_deferred_fd_close(int fd) { struct binder_task_work_cb *twcb; twcb = kzalloc(sizeof(*twcb), GFP_KERNEL); if (!twcb) return; init_task_work(&twcb->twork, binder_do_fd_close); twcb->file = file_close_fd(fd); if (twcb->file) { // pin it until binder_do_fd_close(); see comments there get_file(twcb->file); filp_close(twcb->file, current->files); task_work_add(current, &twcb->twork, TWA_RESUME); } else { kfree(twcb); } } static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t off_end_offset, bool is_failure) { int debug_id = buffer->debug_id; binder_size_t off_start_offset, buffer_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, (unsigned long long)off_end_offset); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size = 0; struct binder_object object; binder_size_t object_offset; if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, buffer, buffer_offset, sizeof(object_offset))) object_size = binder_get_object(proc, NULL, buffer, object_offset, &object); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)object_offset, buffer->data_size); continue; } hdr = &object.hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; struct binder_node *node; fp = to_flat_binder_object(hdr); node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx\n", node->debug_id, (u64)node->ptr); binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, 0); binder_put_node(node); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; struct binder_ref_data rdata; int ret; fp = to_flat_binder_object(hdr); ret = binder_dec_ref_for_handle(proc, fp->handle, hdr->type == BINDER_TYPE_HANDLE, &rdata); if (ret) { pr_err("transaction release %d bad handle %d, ret = %d\n", debug_id, fp->handle, ret); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d\n", rdata.debug_id, rdata.desc); } break; case BINDER_TYPE_FD: { /* * No need to close the file here since user-space * closes it for successfully delivered * transactions. For transactions that weren't * delivered, the new fd was never allocated so * there is no need to close and the fput on the * file is done when the transaction is torn * down. */ } break; case BINDER_TYPE_PTR: /* * Nothing to do here, this will get cleaned up when the * transaction buffer gets freed */ break; case BINDER_TYPE_FDA: { struct binder_fd_array_object *fda; struct binder_buffer_object *parent; struct binder_object ptr_object; binder_size_t fda_offset; size_t fd_index; binder_size_t fd_buf_size; binder_size_t num_valid; if (is_failure) { /* * The fd fixups have not been applied so no * fds need to be closed. */ continue; } num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); fda = to_binder_fd_array_object(hdr); parent = binder_validate_ptr(proc, buffer, &ptr_object, fda->parent, off_start_offset, NULL, num_valid); if (!parent) { pr_err("transaction release %d bad parent offset\n", debug_id); continue; } fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { pr_err("transaction release %d invalid number of fds (%lld)\n", debug_id, (u64)fda->num_fds); continue; } if (fd_buf_size > parent->length || fda->parent_offset > parent->length - fd_buf_size) { /* No space for all file descriptors here. */ pr_err("transaction release %d not enough space for %lld fds in buffer\n", debug_id, (u64)fda->num_fds); continue; } /* * the source data for binder_buffer_object is visible * to user-space and the @buffer element is the user * pointer to the buffer_object containing the fd_array. * Convert the address to an offset relative to * the base of the transaction buffer. */ fda_offset = parent->buffer - buffer->user_data + fda->parent_offset; for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; int err; binder_size_t offset = fda_offset + fd_index * sizeof(fd); err = binder_alloc_copy_from_buffer( &proc->alloc, &fd, buffer, offset, sizeof(fd)); WARN_ON(err); if (!err) { binder_deferred_fd_close(fd); /* * Need to make sure the thread goes * back to userspace to complete the * deferred close */ if (thread) thread->looper_need_return = true; } } } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); break; } } } /* Clean up all the objects in the buffer */ static inline void binder_release_entire_buffer(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_size_t off_end_offset; off_end_offset = ALIGN(buffer->data_size, sizeof(void *)); off_end_offset += buffer->offsets_size; binder_transaction_buffer_release(proc, thread, buffer, off_end_offset, is_failure); } static int binder_translate_binder(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_node *node; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_ref_data rdata; int ret = 0; node = binder_get_node(proc, fp->binder); if (!node) { node = binder_new_node(proc, fp); if (!node) return -ENOMEM; } if (fp->cookie != node->cookie) { binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)fp->binder, node->debug_id, (u64)fp->cookie, (u64)node->cookie); ret = -EINVAL; goto done; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_BINDER, &thread->todo, &rdata); if (ret) goto done; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; else fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = rdata.desc; fp->cookie = 0; trace_binder_transaction_node_to_ref(t, node, &rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, rdata.debug_id, rdata.desc); done: binder_put_node(node); return ret; } static int binder_translate_handle(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_node *node; struct binder_ref_data src_rdata; int ret = 0; node = binder_get_node_from_ref(proc, fp->handle, fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); if (!node) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, thread->pid, fp->handle); return -EINVAL; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } binder_node_lock(node); if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; else fp->hdr.type = BINDER_TYPE_WEAK_BINDER; fp->binder = node->ptr; fp->cookie = node->cookie; if (node->proc) binder_inner_proc_lock(node->proc); else __acquire(&node->proc->inner_lock); binder_inc_node_nilocked(node, fp->hdr.type == BINDER_TYPE_BINDER, 0, NULL); if (node->proc) binder_inner_proc_unlock(node->proc); else __release(&node->proc->inner_lock); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", src_rdata.debug_id, src_rdata.desc, node->debug_id, (u64)node->ptr); binder_node_unlock(node); } else { struct binder_ref_data dest_rdata; binder_node_unlock(node); ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); if (ret) goto done; fp->binder = 0; fp->handle = dest_rdata.desc; fp->cookie = 0; trace_binder_transaction_ref_to_ref(t, node, &src_rdata, &dest_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> ref %d desc %d (node %d)\n", src_rdata.debug_id, src_rdata.desc, dest_rdata.debug_id, dest_rdata.desc, node->debug_id); } done: binder_put_node(node); return ret; } static int binder_translate_fd(u32 fd, binder_size_t fd_offset, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_txn_fd_fixup *fixup; struct file *file; int ret = 0; bool target_allows_fd; if (in_reply_to) target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS); else target_allows_fd = t->buffer->target_node->accept_fds; if (!target_allows_fd) { binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n", proc->pid, thread->pid, in_reply_to ? "reply" : "transaction", fd); ret = -EPERM; goto err_fd_not_accepted; } file = fget(fd); if (!file) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", proc->pid, thread->pid, fd); ret = -EBADF; goto err_fget; } ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; } /* * Add fixup record for this transaction. The allocation * of the fd in the target needs to be done from a * target thread. */ fixup = kzalloc(sizeof(*fixup), GFP_KERNEL); if (!fixup) { ret = -ENOMEM; goto err_alloc; } fixup->file = file; fixup->offset = fd_offset; fixup->target_fd = -1; trace_binder_transaction_fd_send(t, fd, fixup->offset); list_add_tail(&fixup->fixup_entry, &t->fd_fixups); return ret; err_alloc: err_security: fput(file); err_fget: err_fd_not_accepted: return ret; } /** * struct binder_ptr_fixup - data to be fixed-up in target buffer * @offset: offset in target buffer to fixup * @skip_size: bytes to skip in copy (fixup will be written later) * @fixup_data: data to write at fixup offset * @node: list node * * This is used for the pointer fixup list (pf) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. * * The list is ordered by @offset. */ struct binder_ptr_fixup { binder_size_t offset; size_t skip_size; binder_uintptr_t fixup_data; struct list_head node; }; /** * struct binder_sg_copy - scatter-gather data to be copied * @offset: offset in target buffer * @sender_uaddr: user address in source buffer * @length: bytes to copy * @node: list node * * This is used for the sg copy list (sgc) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. * * The list is ordered by @offset. */ struct binder_sg_copy { binder_size_t offset; const void __user *sender_uaddr; size_t length; struct list_head node; }; /** * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data * @alloc: binder_alloc associated with @buffer * @buffer: binder buffer in target process * @sgc_head: list_head of scatter-gather copy list * @pf_head: list_head of pointer fixup list * * Processes all elements of @sgc_head, applying fixups from @pf_head * and copying the scatter-gather data from the source process' user * buffer to the target's buffer. It is expected that the list creation * and processing all occurs during binder_transaction() so these lists * are only accessed in local context. * * Return: 0=success, else -errno */ static int binder_do_deferred_txn_copies(struct binder_alloc *alloc, struct binder_buffer *buffer, struct list_head *sgc_head, struct list_head *pf_head) { int ret = 0; struct binder_sg_copy *sgc, *tmpsgc; struct binder_ptr_fixup *tmppf; struct binder_ptr_fixup *pf = list_first_entry_or_null(pf_head, struct binder_ptr_fixup, node); list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { size_t bytes_copied = 0; while (bytes_copied < sgc->length) { size_t copy_size; size_t bytes_left = sgc->length - bytes_copied; size_t offset = sgc->offset + bytes_copied; /* * We copy up to the fixup (pointed to by pf) */ copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset) : bytes_left; if (!ret && copy_size) ret = binder_alloc_copy_user_to_buffer( alloc, buffer, offset, sgc->sender_uaddr + bytes_copied, copy_size); bytes_copied += copy_size; if (copy_size != bytes_left) { BUG_ON(!pf); /* we stopped at a fixup offset */ if (pf->skip_size) { /* * we are just skipping. This is for * BINDER_TYPE_FDA where the translated * fds will be fixed up when we get * to target context. */ bytes_copied += pf->skip_size; } else { /* apply the fixup indicated by pf */ if (!ret) ret = binder_alloc_copy_to_buffer( alloc, buffer, pf->offset, &pf->fixup_data, sizeof(pf->fixup_data)); bytes_copied += sizeof(pf->fixup_data); } list_del(&pf->node); kfree(pf); pf = list_first_entry_or_null(pf_head, struct binder_ptr_fixup, node); } } list_del(&sgc->node); kfree(sgc); } list_for_each_entry_safe(pf, tmppf, pf_head, node) { BUG_ON(pf->skip_size == 0); list_del(&pf->node); kfree(pf); } BUG_ON(!list_empty(sgc_head)); return ret > 0 ? -EINVAL : ret; } /** * binder_cleanup_deferred_txn_lists() - free specified lists * @sgc_head: list_head of scatter-gather copy list * @pf_head: list_head of pointer fixup list * * Called to clean up @sgc_head and @pf_head if there is an * error. */ static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head, struct list_head *pf_head) { struct binder_sg_copy *sgc, *tmpsgc; struct binder_ptr_fixup *pf, *tmppf; list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { list_del(&sgc->node); kfree(sgc); } list_for_each_entry_safe(pf, tmppf, pf_head, node) { list_del(&pf->node); kfree(pf); } } /** * binder_defer_copy() - queue a scatter-gather buffer for copy * @sgc_head: list_head of scatter-gather copy list * @offset: binder buffer offset in target process * @sender_uaddr: user address in source process * @length: bytes to copy * * Specify a scatter-gather block to be copied. The actual copy must * be deferred until all the needed fixups are identified and queued. * Then the copy and fixups are done together so un-translated values * from the source are never visible in the target buffer. * * We are guaranteed that repeated calls to this function will have * monotonically increasing @offset values so the list will naturally * be ordered. * * Return: 0=success, else -errno */ static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset, const void __user *sender_uaddr, size_t length) { struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL); if (!bc) return -ENOMEM; bc->offset = offset; bc->sender_uaddr = sender_uaddr; bc->length = length; INIT_LIST_HEAD(&bc->node); /* * We are guaranteed that the deferred copies are in-order * so just add to the tail. */ list_add_tail(&bc->node, sgc_head); return 0; } /** * binder_add_fixup() - queue a fixup to be applied to sg copy * @pf_head: list_head of binder ptr fixup list * @offset: binder buffer offset in target process * @fixup: bytes to be copied for fixup * @skip_size: bytes to skip when copying (fixup will be applied later) * * Add the specified fixup to a list ordered by @offset. When copying * the scatter-gather buffers, the fixup will be copied instead of * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup * will be applied later (in target process context), so we just skip * the bytes specified by @skip_size. If @skip_size is 0, we copy the * value in @fixup. * * This function is called *mostly* in @offset order, but there are * exceptions. Since out-of-order inserts are relatively uncommon, * we insert the new element by searching backward from the tail of * the list. * * Return: 0=success, else -errno */ static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset, binder_uintptr_t fixup, size_t skip_size) { struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL); struct binder_ptr_fixup *tmppf; if (!pf) return -ENOMEM; pf->offset = offset; pf->fixup_data = fixup; pf->skip_size = skip_size; INIT_LIST_HEAD(&pf->node); /* Fixups are *mostly* added in-order, but there are some * exceptions. Look backwards through list for insertion point. */ list_for_each_entry_reverse(tmppf, pf_head, node) { if (tmppf->offset < pf->offset) { list_add(&pf->node, &tmppf->node); return 0; } } /* * if we get here, then the new offset is the lowest so * insert at the head */ list_add(&pf->node, pf_head); return 0; } static int binder_translate_fd_array(struct list_head *pf_head, struct binder_fd_array_object *fda, const void __user *sender_ubuffer, struct binder_buffer_object *parent, struct binder_buffer_object *sender_uparent, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { binder_size_t fdi, fd_buf_size; binder_size_t fda_offset; const void __user *sender_ufda_base; struct binder_proc *proc = thread->proc; int ret; if (fda->num_fds == 0) return 0; fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n", proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } if (fd_buf_size > parent->length || fda->parent_offset > parent->length - fd_buf_size) { /* No space for all file descriptors here. */ binder_user_error("%d:%d not enough space to store %lld fds in buffer\n", proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } /* * the source data for binder_buffer_object is visible * to user-space and the @buffer element is the user * pointer to the buffer_object containing the fd_array. * Convert the address to an offset relative to * the base of the transaction buffer. */ fda_offset = parent->buffer - t->buffer->user_data + fda->parent_offset; sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer + fda->parent_offset; if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) || !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); return -EINVAL; } ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32)); if (ret) return ret; for (fdi = 0; fdi < fda->num_fds; fdi++) { u32 fd; binder_size_t offset = fda_offset + fdi * sizeof(fd); binder_size_t sender_uoffset = fdi * sizeof(fd); ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd)); if (!ret) ret = binder_translate_fd(fd, offset, t, thread, in_reply_to); if (ret) return ret > 0 ? -EINVAL : ret; } return 0; } static int binder_fixup_parent(struct list_head *pf_head, struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, binder_size_t off_start_offset, binder_size_t num_valid, binder_size_t last_fixup_obj_off, binder_size_t last_fixup_min_off) { struct binder_buffer_object *parent; struct binder_buffer *b = t->buffer; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_object object; binder_size_t buffer_offset; binder_size_t parent_offset; if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) return 0; parent = binder_validate_ptr(target_proc, b, &object, bp->parent, off_start_offset, &parent_offset, num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return -EINVAL; } if (!binder_validate_fixup(target_proc, b, off_start_offset, parent_offset, bp->parent_offset, last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return -EINVAL; } if (parent->length < sizeof(binder_uintptr_t) || bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) { /* No space for a pointer here! */ binder_user_error("%d:%d got transaction with invalid parent offset\n", proc->pid, thread->pid); return -EINVAL; } buffer_offset = bp->parent_offset + parent->buffer - b->user_data; return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0); } /** * binder_can_update_transaction() - Can a txn be superseded by an updated one? * @t1: the pending async txn in the frozen process * @t2: the new async txn to supersede the outdated pending one * * Return: true if t2 can supersede t1 * false if t2 can not supersede t1 */ static bool binder_can_update_transaction(struct binder_transaction *t1, struct binder_transaction *t2) { if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) return false; if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && t1->buffer->target_node->ptr == t2->buffer->target_node->ptr && t1->buffer->target_node->cookie == t2->buffer->target_node->cookie) return true; return false; } /** * binder_find_outdated_transaction_ilocked() - Find the outdated transaction * @t: new async transaction * @target_list: list to find outdated transaction * * Return: the outdated transaction if found * NULL if no outdated transacton can be found * * Requires the proc->inner_lock to be held. */ static struct binder_transaction * binder_find_outdated_transaction_ilocked(struct binder_transaction *t, struct list_head *target_list) { struct binder_work *w; list_for_each_entry(w, target_list, entry) { struct binder_transaction *t_queued; if (w->type != BINDER_WORK_TRANSACTION) continue; t_queued = container_of(w, struct binder_transaction, work); if (binder_can_update_transaction(t_queued, t)) return t_queued; } return NULL; } /** * binder_proc_transaction() - sends a transaction to a process and wakes it up * @t: transaction to send * @proc: process to send the transaction to * @thread: thread in @proc to send the transaction to (may be NULL) * * This function queues a transaction to the specified process. It will try * to find a thread in the target process to handle the transaction and * wake it up. If no thread is found, the work is queued to the proc * waitqueue. * * If the @thread parameter is not NULL, the transaction is always queued * to the waitlist of that specific thread. * * Return: 0 if the transaction was successfully queued * BR_DEAD_REPLY if the target process or thread is dead * BR_FROZEN_REPLY if the target process or thread is frozen and * the sync transaction was rejected * BR_TRANSACTION_PENDING_FROZEN if the target process is frozen * and the async transaction was successfully queued */ static int binder_proc_transaction(struct binder_transaction *t, struct binder_proc *proc, struct binder_thread *thread) { struct binder_node *node = t->buffer->target_node; bool oneway = !!(t->flags & TF_ONE_WAY); bool pending_async = false; struct binder_transaction *t_outdated = NULL; bool frozen = false; BUG_ON(!node); binder_node_lock(node); if (oneway) { BUG_ON(thread); if (node->has_async_transaction) pending_async = true; else node->has_async_transaction = true; } binder_inner_proc_lock(proc); if (proc->is_frozen) { frozen = true; proc->sync_recv |= !oneway; proc->async_recv |= oneway; } if ((frozen && !oneway) || proc->is_dead || (thread && thread->is_dead)) { binder_inner_proc_unlock(proc); binder_node_unlock(node); return frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY; } if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); if (thread) { binder_enqueue_thread_work_ilocked(thread, &t->work); } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); } else { if ((t->flags & TF_UPDATE_TXN) && frozen) { t_outdated = binder_find_outdated_transaction_ilocked(t, &node->async_todo); if (t_outdated) { binder_debug(BINDER_DEBUG_TRANSACTION, "txn %d supersedes %d\n", t->debug_id, t_outdated->debug_id); list_del_init(&t_outdated->work.entry); proc->outstanding_txns--; } } binder_enqueue_work_ilocked(&t->work, &node->async_todo); } if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); proc->outstanding_txns++; binder_inner_proc_unlock(proc); binder_node_unlock(node); /* * To reduce potential contention, free the outdated transaction and * buffer after releasing the locks. */ if (t_outdated) { struct binder_buffer *buffer = t_outdated->buffer; t_outdated->buffer = NULL; buffer->transaction = NULL; trace_binder_transaction_update_buffer_release(buffer); binder_release_entire_buffer(proc, NULL, buffer, false); binder_alloc_free_buf(&proc->alloc, buffer); kfree(t_outdated); binder_stats_deleted(BINDER_STAT_TRANSACTION); } if (oneway && frozen) return BR_TRANSACTION_PENDING_FROZEN; return 0; } /** * binder_get_node_refs_for_txn() - Get required refs on node for txn * @node: struct binder_node for which to get refs * @procp: returns @node->proc if valid * @error: if no @procp then returns BR_DEAD_REPLY * * User-space normally keeps the node alive when creating a transaction * since it has a reference to the target. The local strong ref keeps it * alive if the sending process dies before the target process processes * the transaction. If the source process is malicious or has a reference * counting bug, relying on the local strong ref can fail. * * Since user-space can cause the local strong ref to go away, we also take * a tmpref on the node to ensure it survives while we are constructing * the transaction. We also need a tmpref on the proc while we are * constructing the transaction, so we take that here as well. * * Return: The target_node with refs taken or NULL if no @node->proc is NULL. * Also sets @procp if valid. If the @node->proc is NULL indicating that the * target proc has died, @error is set to BR_DEAD_REPLY. */ static struct binder_node *binder_get_node_refs_for_txn( struct binder_node *node, struct binder_proc **procp, uint32_t *error) { struct binder_node *target_node = NULL; binder_node_inner_lock(node); if (node->proc) { target_node = node; binder_inc_node_nilocked(node, 1, 0, NULL); binder_inc_node_tmpref_ilocked(node); node->proc->tmp_ref++; *procp = node->proc; } else *error = BR_DEAD_REPLY; binder_node_inner_unlock(node); return target_node; } static void binder_set_txn_from_error(struct binder_transaction *t, int id, uint32_t command, int32_t param) { struct binder_thread *from = binder_get_txn_from_and_acq_inner(t); if (!from) { /* annotation for sparse */ __release(&from->proc->inner_lock); return; } /* don't override existing errors */ if (from->ee.command == BR_OK) binder_set_extended_error(&from->ee, id, command, param); binder_inner_proc_unlock(from->proc); binder_thread_dec_tmpref(from); } /** * binder_netlink_report() - report a transaction failure via netlink * @proc: the binder proc sending the transaction * @t: the binder transaction that failed * @data_size: the user provided data size for the transaction * @error: enum binder_driver_return_protocol returned to sender */ static void binder_netlink_report(struct binder_proc *proc, struct binder_transaction *t, u32 data_size, u32 error) { const char *context = proc->context->name; struct sk_buff *skb; void *hdr; if (!genl_has_listeners(&binder_nl_family, &init_net, BINDER_NLGRP_REPORT)) return; trace_binder_netlink_report(context, t, data_size, error); skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; hdr = genlmsg_put(skb, 0, 0, &binder_nl_family, 0, BINDER_CMD_REPORT); if (!hdr) goto free_skb; if (nla_put_u32(skb, BINDER_A_REPORT_ERROR, error) || nla_put_string(skb, BINDER_A_REPORT_CONTEXT, context) || nla_put_u32(skb, BINDER_A_REPORT_FROM_PID, t->from_pid) || nla_put_u32(skb, BINDER_A_REPORT_FROM_TID, t->from_tid)) goto cancel_skb; if (t->to_proc && nla_put_u32(skb, BINDER_A_REPORT_TO_PID, t->to_proc->pid)) goto cancel_skb; if (t->to_thread && nla_put_u32(skb, BINDER_A_REPORT_TO_TID, t->to_thread->pid)) goto cancel_skb; if (t->is_reply && nla_put_flag(skb, BINDER_A_REPORT_IS_REPLY)) goto cancel_skb; if (nla_put_u32(skb, BINDER_A_REPORT_FLAGS, t->flags) || nla_put_u32(skb, BINDER_A_REPORT_CODE, t->code) || nla_put_u32(skb, BINDER_A_REPORT_DATA_SIZE, data_size)) goto cancel_skb; genlmsg_end(skb, hdr); genlmsg_multicast(&binder_nl_family, skb, 0, BINDER_NLGRP_REPORT, GFP_KERNEL); return; cancel_skb: genlmsg_cancel(skb, hdr); free_skb: nlmsg_free(skb); } static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, binder_size_t extra_buffers_size) { int ret; struct binder_transaction *t; struct binder_work *w; struct binder_work *tcomplete; binder_size_t buffer_offset = 0; binder_size_t off_start_offset, off_end_offset; binder_size_t off_min; binder_size_t sg_buf_offset, sg_buf_end_offset; binder_size_t user_offset = 0; struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; uint32_t return_error_param = 0; uint32_t return_error_line = 0; binder_size_t last_fixup_obj_off = 0; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); ktime_t t_start_time = ktime_get(); struct lsm_context lsmctx = { }; struct list_head sgc_head; struct list_head pf_head; const void __user *user_buffer = (const void __user *) (uintptr_t)tr->data.ptr.buffer; INIT_LIST_HEAD(&sgc_head); INIT_LIST_HEAD(&pf_head); e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, BR_OK, 0); binder_inner_proc_unlock(proc); t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) { binder_txn_error("%d:%d cannot allocate transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_t_failed; } INIT_LIST_HEAD(&t->fd_fixups); binder_stats_created(BINDER_STAT_TRANSACTION); spin_lock_init(&t->lock); t->debug_id = t_debug_id; t->start_time = t_start_time; t->from_pid = proc->pid; t->from_tid = thread->pid; t->sender_euid = task_euid(proc->tsk); t->code = tr->code; t->flags = tr->flags; t->priority = task_nice(current); t->work.type = BINDER_WORK_TRANSACTION; t->is_async = !reply && (tr->flags & TF_ONE_WAY); t->is_reply = reply; if (!reply && !(tr->flags & TF_ONE_WAY)) t->from = thread; if (reply) { binder_inner_proc_lock(proc); in_reply_to = thread->transaction_stack; if (in_reply_to == NULL) { binder_inner_proc_unlock(proc); binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_empty_call_stack; } if (in_reply_to->to_thread != thread) { spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, in_reply_to->debug_id, in_reply_to->to_proc ? in_reply_to->to_proc->pid : 0, in_reply_to->to_thread ? in_reply_to->to_thread->pid : 0); spin_unlock(&in_reply_to->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); binder_set_nice(in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { /* annotation for sparse */ __release(&target_thread->proc->inner_lock); binder_txn_error("%d:%d reply target not found\n", thread->pid, proc->pid); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; } if (target_thread->transaction_stack != in_reply_to) { binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n", proc->pid, thread->pid, target_thread->transaction_stack ? target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); binder_inner_proc_unlock(target_thread->proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; target_thread = NULL; goto err_dead_binder; } target_proc = target_thread->proc; target_proc->tmp_ref++; binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { struct binder_ref *ref; /* * There must already be a strong ref * on this node. If so, do a strong * increment on the node to ensure it * stays alive until the transaction is * done. */ binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { target_node = binder_get_node_refs_for_txn( ref->node, &target_proc, &return_error); } else { binder_user_error("%d:%d got transaction to invalid handle, %u\n", proc->pid, thread->pid, tr->target.handle); return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; if (target_node) target_node = binder_get_node_refs_for_txn( target_node, &target_proc, &return_error); else return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); if (target_node && target_proc->pid == proc->pid) { binder_user_error("%d:%d got transaction to context manager from process owning it\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } } if (!target_node) { binder_txn_error("%d:%d cannot find target node\n", proc->pid, thread->pid); /* return_error is set above */ return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } e->to_node = target_node->debug_id; if (WARN_ON(proc == target_proc)) { binder_txn_error("%d:%d self transactions not allowed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } if (security_binder_transaction(proc->cred, target_proc->cred) < 0) { binder_txn_error("%d:%d transaction credentials failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; goto err_invalid_target_handle; } binder_inner_proc_lock(proc); w = list_first_entry_or_null(&thread->todo, struct binder_work, entry); if (!(tr->flags & TF_ONE_WAY) && w && w->type == BINDER_WORK_TRANSACTION) { /* * Do not allow new outgoing transaction from a * thread that has a transaction at the head of * its todo list. Only need to check the head * because binder_select_thread_ilocked picks a * thread from proc->waiting_threads to enqueue * the transaction, and nothing is queued to the * todo list while the thread is on waiting_threads. */ binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n", proc->pid, thread->pid); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_todo_list; } if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; tmp = thread->transaction_stack; if (tmp->to_thread != thread) { spin_lock(&tmp->lock); binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, tmp->debug_id, tmp->to_proc ? tmp->to_proc->pid : 0, tmp->to_thread ? tmp->to_thread->pid : 0); spin_unlock(&tmp->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { struct binder_thread *from; spin_lock(&tmp->lock); from = tmp->from; if (from && from->proc == target_proc) { atomic_inc(&from->tmp_ref); target_thread = from; spin_unlock(&tmp->lock); break; } spin_unlock(&tmp->lock); tmp = tmp->from_parent; } } binder_inner_proc_unlock(proc); } t->to_proc = target_proc; t->to_thread = target_thread; if (target_thread) e->to_thread = target_thread->pid; e->to_proc = target_proc->pid; tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { binder_txn_error("%d:%d cannot allocate work for transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_tcomplete_failed; } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_REPLY %d -> %d:%d, data size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_thread->pid, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); else binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_TRANSACTION %d -> %d - node %d, data size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_node->debug_id, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); if (target_node && target_node->txn_security_ctx) { u32 secid; size_t added_size; security_cred_getsecid(proc->cred, &secid); ret = security_secid_to_secctx(secid, &lsmctx); if (ret < 0) { binder_txn_error("%d:%d failed to get security context\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_get_secctx_failed; } added_size = ALIGN(lsmctx.len, sizeof(u64)); extra_buffers_size += added_size; if (extra_buffers_size < added_size) { binder_txn_error("%d:%d integer overflow of extra_buffers_size\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_extra_size; } } trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); if (IS_ERR(t->buffer)) { char *s; ret = PTR_ERR(t->buffer); s = (ret == -ESRCH) ? ": vma cleared, target dead or dying" : (ret == -ENOSPC) ? ": no space left" : (ret == -ENOMEM) ? ": memory allocation failed" : ""; binder_txn_error("cannot allocate buffer%s", s); return_error_param = PTR_ERR(t->buffer); return_error = return_error_param == -ESRCH ? BR_DEAD_REPLY : BR_FAILED_REPLY; return_error_line = __LINE__; t->buffer = NULL; goto err_binder_alloc_buf_failed; } if (lsmctx.context) { int err; size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(lsmctx.len, sizeof(u64)); t->security_ctx = t->buffer->user_data + buf_offset; err = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, buf_offset, lsmctx.context, lsmctx.len); if (err) { t->security_ctx = 0; WARN_ON(1); } security_release_secctx(&lsmctx); lsmctx.context = NULL; } t->buffer->debug_id = t->debug_id; t->buffer->transaction = t; t->buffer->target_node = target_node; t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF); trace_binder_transaction_alloc_buf(t->buffer); if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, ALIGN(tr->data_size, sizeof(void *)), (const void __user *) (uintptr_t)tr->data.ptr.offsets, tr->offsets_size)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", proc->pid, thread->pid, (u64)tr->offsets_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", proc->pid, thread->pid, (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } off_start_offset = ALIGN(tr->data_size, sizeof(void *)); buffer_offset = off_start_offset; off_end_offset = off_start_offset + tr->offsets_size; sg_buf_offset = ALIGN(off_end_offset, sizeof(void *)); sg_buf_end_offset = sg_buf_offset + extra_buffers_size - ALIGN(lsmctx.len, sizeof(u64)); off_min = 0; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size; struct binder_object object; binder_size_t object_offset; binder_size_t copy_size; if (binder_alloc_copy_from_buffer(&target_proc->alloc, &object_offset, t->buffer, buffer_offset, sizeof(object_offset))) { binder_txn_error("%d:%d copy offset from buffer failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Copy the source user buffer up to the next object * that will be processed. */ copy_size = object_offset - user_offset; if (copy_size && (user_offset > object_offset || object_offset > tr->data_size || binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, copy_size))) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } object_size = binder_get_object(target_proc, user_buffer, t->buffer, object_offset, &object); if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)object_offset, (u64)off_min, (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Set offset to the next buffer fragment to be * copied */ user_offset = object_offset + object_size; hdr = &object.hdr; off_min = object_offset + object_size; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_binder(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate binder failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_handle(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate handle failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_FD: { struct binder_fd_object *fp = to_binder_fd_object(hdr); binder_size_t fd_offset = object_offset + (uintptr_t)&fp->fd - (uintptr_t)fp; int ret = binder_translate_fd(fp->fd, fd_offset, t, thread, in_reply_to); fp->pad_binder = 0; if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate fd failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_FDA: { struct binder_object ptr_object; binder_size_t parent_offset; struct binder_object user_object; size_t user_parent_size; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); size_t num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); struct binder_buffer_object *parent = binder_validate_ptr(target_proc, t->buffer, &ptr_object, fda->parent, off_start_offset, &parent_offset, num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } if (!binder_validate_fixup(target_proc, t->buffer, off_start_offset, parent_offset, fda->parent_offset, last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } /* * We need to read the user version of the parent * object to get the original user offset */ user_parent_size = binder_get_object(proc, user_buffer, t->buffer, parent_offset, &user_object); if (user_parent_size != sizeof(user_object.bbo)) { binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n", proc->pid, thread->pid, user_parent_size, sizeof(user_object.bbo)); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } ret = binder_translate_fd_array(&pf_head, fda, user_buffer, parent, &user_object.bbo, t, thread, in_reply_to); if (!ret) ret = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fda, sizeof(*fda)); if (ret) { binder_txn_error("%d:%d translate fd array failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret > 0 ? -EINVAL : ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = parent_offset; last_fixup_min_off = fda->parent_offset + sizeof(u32) * fda->num_fds; } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); size_t buf_left = sg_buf_end_offset - sg_buf_offset; size_t num_valid; if (bp->length > buf_left) { binder_user_error("%d:%d got transaction with too large buffer\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } ret = binder_defer_copy(&sgc_head, sg_buf_offset, (const void __user *)(uintptr_t)bp->buffer, bp->length); if (ret) { binder_txn_error("%d:%d deferred copy failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } /* Fixup buffer pointer to target proc address space */ bp->buffer = t->buffer->user_data + sg_buf_offset; sg_buf_offset += ALIGN(bp->length, sizeof(u64)); num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); ret = binder_fixup_parent(&pf_head, t, thread, bp, off_start_offset, num_valid, last_fixup_obj_off, last_fixup_min_off); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, bp, sizeof(*bp))) { binder_txn_error("%d:%d failed to fixup parent\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = object_offset; last_fixup_min_off = 0; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_object_type; } } /* Done processing objects, copy the rest of the buffer */ if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, tr->data_size - user_offset)) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer, &sgc_head, &pf_head); if (ret) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_copy_data_failed; } if (t->buffer->oneway_spam_suspect) { tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; binder_netlink_report(proc, t, tr->data_size, BR_ONEWAY_SPAM_SUSPECT); } else { tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; } if (reply) { binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); if (target_thread->is_dead) { return_error = BR_DEAD_REPLY; binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); binder_enqueue_thread_work_ilocked(target_thread, &t->work); target_proc->outstanding_txns++; binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); binder_inner_proc_lock(proc); /* * Defer the TRANSACTION_COMPLETE, so we don't return to * userspace immediately; this allows the target process to * immediately start processing this transaction, reducing * latency. We will then return the TRANSACTION_COMPLETE when * the target replies (or there is an error). */ binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete); t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); return_error = binder_proc_transaction(t, target_proc, target_thread); if (return_error) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); return_error = binder_proc_transaction(t, target_proc, NULL); /* * Let the caller know when async transaction reaches a frozen * process and is put in a pending queue, waiting for the target * process to be unfrozen. */ if (return_error == BR_TRANSACTION_PENDING_FROZEN) { tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; binder_netlink_report(proc, t, tr->data_size, return_error); } binder_enqueue_thread_work(thread, tcomplete); if (return_error && return_error != BR_TRANSACTION_PENDING_FROZEN) goto err_dead_proc_or_thread; } if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); if (target_node) binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); return; err_dead_proc_or_thread: binder_txn_error("%d:%d dead process or thread\n", thread->pid, proc->pid); return_error_line = __LINE__; binder_dequeue_work(proc, tcomplete); err_translate_failed: err_bad_object_type: err_bad_offset: err_bad_parent: err_copy_data_failed: binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head); binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, NULL, t->buffer, buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: err_bad_extra_size: if (lsmctx.context) security_release_secctx(&lsmctx); err_get_secctx_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: if (trace_binder_txn_latency_free_enabled()) binder_txn_latency_free(t); err_bad_todo_list: err_bad_call_stack: err_empty_call_stack: err_dead_binder: err_invalid_target_handle: if (target_node) { binder_dec_node(target_node, 1, 0); binder_dec_node_tmpref(target_node); } binder_netlink_report(proc, t, tr->data_size, return_error); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction %s to %d:%d failed %d/%d/%d, code %u size %lld-%lld line %d\n", proc->pid, thread->pid, reply ? "reply" : (tr->flags & TF_ONE_WAY ? "async" : "call"), target_proc ? target_proc->pid : 0, target_thread ? target_thread->pid : 0, t_debug_id, return_error, return_error_param, tr->code, (u64)tr->data_size, (u64)tr->offsets_size, return_error_line); if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); { struct binder_transaction_log_entry *fe; e->return_error = return_error; e->return_error_param = return_error_param; e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); WRITE_ONCE(fe->debug_id_done, t_debug_id); } BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { binder_set_txn_from_error(in_reply_to, t_debug_id, return_error, return_error_param); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_thread_work(thread, &thread->return_error.work); binder_send_failed_reply(in_reply_to, return_error); } else { binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, return_error, return_error_param); binder_inner_proc_unlock(proc); thread->return_error.cmd = return_error; binder_enqueue_thread_work(thread, &thread->return_error.work); } } static int binder_request_freeze_notification(struct binder_proc *proc, struct binder_thread *thread, struct binder_handle_cookie *handle_cookie) { struct binder_ref_freeze *freeze; struct binder_ref *ref; freeze = kzalloc(sizeof(*freeze), GFP_KERNEL); if (!freeze) return -ENOMEM; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, handle_cookie->handle, false); if (!ref) { binder_user_error("%d:%d BC_REQUEST_FREEZE_NOTIFICATION invalid ref %d\n", proc->pid, thread->pid, handle_cookie->handle); binder_proc_unlock(proc); kfree(freeze); return -EINVAL; } binder_node_lock(ref->node); if (ref->freeze) { binder_user_error("%d:%d BC_REQUEST_FREEZE_NOTIFICATION already set\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); kfree(freeze); return -EINVAL; } binder_stats_created(BINDER_STAT_FREEZE); INIT_LIST_HEAD(&freeze->work.entry); freeze->cookie = handle_cookie->cookie; freeze->work.type = BINDER_WORK_FROZEN_BINDER; ref->freeze = freeze; if (ref->node->proc) { binder_inner_proc_lock(ref->node->proc); freeze->is_frozen = ref->node->proc->is_frozen; binder_inner_proc_unlock(ref->node->proc); binder_inner_proc_lock(proc); binder_enqueue_work_ilocked(&freeze->work, &proc->todo); binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); } binder_node_unlock(ref->node); binder_proc_unlock(proc); return 0; } static int binder_clear_freeze_notification(struct binder_proc *proc, struct binder_thread *thread, struct binder_handle_cookie *handle_cookie) { struct binder_ref_freeze *freeze; struct binder_ref *ref; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, handle_cookie->handle, false); if (!ref) { binder_user_error("%d:%d BC_CLEAR_FREEZE_NOTIFICATION invalid ref %d\n", proc->pid, thread->pid, handle_cookie->handle); binder_proc_unlock(proc); return -EINVAL; } binder_node_lock(ref->node); if (!ref->freeze) { binder_user_error("%d:%d BC_CLEAR_FREEZE_NOTIFICATION freeze notification not active\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); return -EINVAL; } freeze = ref->freeze; binder_inner_proc_lock(proc); if (freeze->cookie != handle_cookie->cookie) { binder_user_error("%d:%d BC_CLEAR_FREEZE_NOTIFICATION freeze notification cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)freeze->cookie, (u64)handle_cookie->cookie); binder_inner_proc_unlock(proc); binder_node_unlock(ref->node); binder_proc_unlock(proc); return -EINVAL; } ref->freeze = NULL; /* * Take the existing freeze object and overwrite its work type. There are three cases here: * 1. No pending notification. In this case just add the work to the queue. * 2. A notification was sent and is pending an ack from userspace. Once an ack arrives, we * should resend with the new work type. * 3. A notification is pending to be sent. Since the work is already in the queue, nothing * needs to be done here. */ freeze->work.type = BINDER_WORK_CLEAR_FREEZE_NOTIFICATION; if (list_empty(&freeze->work.entry)) { binder_enqueue_work_ilocked(&freeze->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } else if (freeze->sent) { freeze->resend = true; } binder_inner_proc_unlock(proc); binder_node_unlock(ref->node); binder_proc_unlock(proc); return 0; } static int binder_freeze_notification_done(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t cookie) { struct binder_ref_freeze *freeze = NULL; struct binder_work *w; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->delivered_freeze, entry) { struct binder_ref_freeze *tmp_freeze = container_of(w, struct binder_ref_freeze, work); if (tmp_freeze->cookie == cookie) { freeze = tmp_freeze; break; } } if (!freeze) { binder_user_error("%d:%d BC_FREEZE_NOTIFICATION_DONE %016llx not found\n", proc->pid, thread->pid, (u64)cookie); binder_inner_proc_unlock(proc); return -EINVAL; } binder_dequeue_work_ilocked(&freeze->work); freeze->sent = false; if (freeze->resend) { freeze->resend = false; binder_enqueue_work_ilocked(&freeze->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } binder_inner_proc_unlock(proc); return 0; } /** * binder_free_buf() - free the specified buffer * @proc: binder proc that owns buffer * @thread: binder thread performing the buffer release * @buffer: buffer to be freed * @is_failure: failed to send transaction * * If the buffer is for an async transaction, enqueue the next async * transaction from the node. * * Cleanup the buffer and free it. */ static void binder_free_buf(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_inner_proc_lock(proc); if (buffer->transaction) { buffer->transaction->buffer = NULL; buffer->transaction = NULL; } binder_inner_proc_unlock(proc); if (buffer->async_transaction && buffer->target_node) { struct binder_node *buf_node; struct binder_work *w; buf_node = buffer->target_node; binder_node_inner_lock(buf_node); BUG_ON(!buf_node->has_async_transaction); BUG_ON(buf_node->proc != proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); if (!w) { buf_node->has_async_transaction = false; } else { binder_enqueue_work_ilocked( w, &proc->todo); binder_wakeup_proc_ilocked(proc); } binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); binder_release_entire_buffer(proc, thread, buffer, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } static int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, binder_size_t *consumed) { uint32_t cmd; struct binder_context *context = proc->context; void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; while (ptr < end && thread->return_error.cmd == BR_OK) { int ret; if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); trace_binder_command(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]); atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]); atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]); } switch (cmd) { case BC_INCREFS: case BC_ACQUIRE: case BC_RELEASE: case BC_DECREFS: { uint32_t target; const char *debug_string; bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE; bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE; struct binder_ref_data rdata; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); ret = -1; if (increment && !target) { struct binder_node *ctx_mgr_node; mutex_lock(&context->context_mgr_node_lock); ctx_mgr_node = context->binder_context_mgr_node; if (ctx_mgr_node) { if (ctx_mgr_node->proc == proc) { binder_user_error("%d:%d context manager tried to acquire desc 0\n", proc->pid, thread->pid); mutex_unlock(&context->context_mgr_node_lock); return -EINVAL; } ret = binder_inc_ref_for_node( proc, ctx_mgr_node, strong, NULL, &rdata); } mutex_unlock(&context->context_mgr_node_lock); } if (ret) ret = binder_update_ref_for_handle( proc, target, increment, strong, &rdata); if (!ret && rdata.desc != target) { binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n", proc->pid, thread->pid, target, rdata.desc); } switch (cmd) { case BC_INCREFS: debug_string = "IncRefs"; break; case BC_ACQUIRE: debug_string = "Acquire"; break; case BC_RELEASE: debug_string = "Release"; break; case BC_DECREFS: default: debug_string = "DecRefs"; break; } if (ret) { binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n", proc->pid, thread->pid, debug_string, strong, target, ret); break; } binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s ref %d desc %d s %d w %d\n", proc->pid, thread->pid, debug_string, rdata.debug_id, rdata.desc, rdata.strong, rdata.weak); break; } case BC_INCREFS_DONE: case BC_ACQUIRE_DONE: { binder_uintptr_t node_ptr; binder_uintptr_t cookie; struct binder_node *node; bool free_node; if (get_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); node = binder_get_node(proc, node_ptr); if (node == NULL) { binder_user_error("%d:%d %s u%016llx no match\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr); break; } if (cookie != node->cookie) { binder_user_error("%d:%d %s u%016llx node %d cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr, node->debug_id, (u64)cookie, (u64)node->cookie); binder_put_node(node); break; } binder_node_inner_lock(node); if (cmd == BC_ACQUIRE_DONE) { if (node->pending_strong_ref == 0) { binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_strong_ref = 0; } else { if (node->pending_weak_ref == 0) { binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_weak_ref = 0; } free_node = binder_dec_node_nilocked(node, cmd == BC_ACQUIRE_DONE, 0); WARN_ON(free_node); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s node %d ls %d lw %d tr %d\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", node->debug_id, node->local_strong_refs, node->local_weak_refs, node->tmp_refs); binder_node_inner_unlock(node); binder_put_node(node); break; } case BC_ATTEMPT_ACQUIRE: pr_err("BC_ATTEMPT_ACQUIRE not supported\n"); return -EINVAL; case BC_ACQUIRE_RESULT: pr_err("BC_ACQUIRE_RESULT not supported\n"); return -EINVAL; case BC_FREE_BUFFER: { binder_uintptr_t data_ptr; struct binder_buffer *buffer; if (get_user(data_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); buffer = binder_alloc_prepare_to_free(&proc->alloc, data_ptr); if (IS_ERR_OR_NULL(buffer)) { if (PTR_ERR(buffer) == -EPERM) { binder_user_error( "%d:%d BC_FREE_BUFFER matched unreturned or currently freeing buffer at offset %lx\n", proc->pid, thread->pid, (unsigned long)data_ptr - proc->alloc.vm_start); } else { binder_user_error( "%d:%d BC_FREE_BUFFER no match for buffer at offset %lx\n", proc->pid, thread->pid, (unsigned long)data_ptr - proc->alloc.vm_start); } break; } binder_debug(BINDER_DEBUG_FREE_BUFFER, "%d:%d BC_FREE_BUFFER at offset %lx found buffer %d for %s transaction\n", proc->pid, thread->pid, (unsigned long)data_ptr - proc->alloc.vm_start, buffer->debug_id, buffer->transaction ? "active" : "finished"); binder_free_buf(proc, thread, buffer, false); break; } case BC_TRANSACTION_SG: case BC_REPLY_SG: { struct binder_transaction_data_sg tr; if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); binder_transaction(proc, thread, &tr.transaction_data, cmd == BC_REPLY_SG, tr.buffers_size); break; } case BC_TRANSACTION: case BC_REPLY: { struct binder_transaction_data tr; if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); binder_transaction(proc, thread, &tr, cmd == BC_REPLY, 0); break; } case BC_REGISTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_REGISTER_LOOPER\n", proc->pid, thread->pid); binder_inner_proc_lock(proc); if (thread->looper & BINDER_LOOPER_STATE_ENTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n", proc->pid, thread->pid); } else if (proc->requested_threads == 0) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n", proc->pid, thread->pid); } else { proc->requested_threads--; proc->requested_threads_started++; } thread->looper |= BINDER_LOOPER_STATE_REGISTERED; binder_inner_proc_unlock(proc); break; case BC_ENTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_ENTER_LOOPER\n", proc->pid, thread->pid); if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n", proc->pid, thread->pid); } thread->looper |= BINDER_LOOPER_STATE_ENTERED; break; case BC_EXIT_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_EXIT_LOOPER\n", proc->pid, thread->pid); thread->looper |= BINDER_LOOPER_STATE_EXITED; break; case BC_REQUEST_DEATH_NOTIFICATION: case BC_CLEAR_DEATH_NOTIFICATION: { uint32_t target; binder_uintptr_t cookie; struct binder_ref *ref; struct binder_ref_death *death = NULL; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { /* * Allocate memory for death notification * before taking lock */ death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { WARN_ON(thread->return_error.cmd != BR_OK); thread->return_error.cmd = BR_ERROR; binder_enqueue_thread_work( thread, &thread->return_error.work); binder_debug( BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", proc->pid, thread->pid); break; } } binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, cmd == BC_REQUEST_DEATH_NOTIFICATION ? "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", target); binder_proc_unlock(proc); kfree(death); break; } binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx ref %d desc %d s %d w %d for node %d\n", proc->pid, thread->pid, cmd == BC_REQUEST_DEATH_NOTIFICATION ? "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", (u64)cookie, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak, ref->node->debug_id); binder_node_lock(ref->node); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); kfree(death); break; } binder_stats_created(BINDER_STAT_DEATH); INIT_LIST_HEAD(&death->work.entry); death->cookie = cookie; ref->death = death; if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_inner_proc_lock(proc); binder_enqueue_work_ilocked( &ref->death->work, &proc->todo); binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); } } else { if (ref->death == NULL) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); break; } death = ref->death; if (death->cookie != cookie) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)death->cookie, (u64)cookie); binder_node_unlock(ref->node); binder_proc_unlock(proc); break; } ref->death = NULL; binder_inner_proc_lock(proc); if (list_empty(&death->work.entry)) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) binder_enqueue_thread_work_ilocked( thread, &death->work); else { binder_enqueue_work_ilocked( &death->work, &proc->todo); binder_wakeup_proc_ilocked( proc); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } binder_inner_proc_unlock(proc); } binder_node_unlock(ref->node); binder_proc_unlock(proc); } break; case BC_DEAD_BINDER_DONE: { struct binder_work *w; binder_uintptr_t cookie; struct binder_ref_death *death = NULL; if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(cookie); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->delivered_death, entry) { struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work); if (tmp_death->cookie == cookie) { death = tmp_death; break; } } binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d:%d BC_DEAD_BINDER_DONE %016llx found %pK\n", proc->pid, thread->pid, (u64)cookie, death); if (death == NULL) { binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n", proc->pid, thread->pid, (u64)cookie); binder_inner_proc_unlock(proc); break; } binder_dequeue_work_ilocked(&death->work); if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) binder_enqueue_thread_work_ilocked( thread, &death->work); else { binder_enqueue_work_ilocked( &death->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } } binder_inner_proc_unlock(proc); } break; case BC_REQUEST_FREEZE_NOTIFICATION: { struct binder_handle_cookie handle_cookie; int error; if (copy_from_user(&handle_cookie, ptr, sizeof(handle_cookie))) return -EFAULT; ptr += sizeof(handle_cookie); error = binder_request_freeze_notification(proc, thread, &handle_cookie); if (error) return error; } break; case BC_CLEAR_FREEZE_NOTIFICATION: { struct binder_handle_cookie handle_cookie; int error; if (copy_from_user(&handle_cookie, ptr, sizeof(handle_cookie))) return -EFAULT; ptr += sizeof(handle_cookie); error = binder_clear_freeze_notification(proc, thread, &handle_cookie); if (error) return error; } break; case BC_FREEZE_NOTIFICATION_DONE: { binder_uintptr_t cookie; int error; if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(cookie); error = binder_freeze_notification_done(proc, thread, cookie); if (error) return error; } break; default: pr_err("%d:%d unknown command %u\n", proc->pid, thread->pid, cmd); return -EINVAL; } *consumed = ptr - buffer; } return 0; } static void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread, uint32_t cmd) { trace_binder_return(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { atomic_inc(&binder_stats.br[_IOC_NR(cmd)]); atomic_inc(&proc->stats.br[_IOC_NR(cmd)]); atomic_inc(&thread->stats.br[_IOC_NR(cmd)]); } } static int binder_put_node_cmd(struct binder_proc *proc, struct binder_thread *thread, void __user **ptrp, binder_uintptr_t node_ptr, binder_uintptr_t node_cookie, int node_debug_id, uint32_t cmd, const char *cmd_name) { void __user *ptr = *ptrp; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (put_user(node_cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n", proc->pid, thread->pid, cmd_name, node_debug_id, (u64)node_ptr, (u64)node_cookie); *ptrp = ptr; return 0; } static int binder_wait_for_work(struct binder_thread *thread, bool do_proc_work) { DEFINE_WAIT(wait); struct binder_proc *proc = thread->proc; int ret = 0; binder_inner_proc_lock(proc); for (;;) { prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE); if (binder_has_work_ilocked(thread, do_proc_work)) break; if (do_proc_work) list_add(&thread->waiting_thread_node, &proc->waiting_threads); binder_inner_proc_unlock(proc); schedule(); binder_inner_proc_lock(proc); list_del_init(&thread->waiting_thread_node); if (signal_pending(current)) { ret = -EINTR; break; } } finish_wait(&thread->wait, &wait); binder_inner_proc_unlock(proc); return ret; } /** * binder_apply_fd_fixups() - finish fd translation * @proc: binder_proc associated @t->buffer * @t: binder transaction with list of fd fixups * * Now that we are in the context of the transaction target * process, we can allocate and install fds. Process the * list of fds to translate and fixup the buffer with the * new fds first and only then install the files. * * If we fail to allocate an fd, skip the install and release * any fds that have already been allocated. */ static int binder_apply_fd_fixups(struct binder_proc *proc, struct binder_transaction *t) { struct binder_txn_fd_fixup *fixup, *tmp; int ret = 0; list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) { int fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { binder_debug(BINDER_DEBUG_TRANSACTION, "failed fd fixup txn %d fd %d\n", t->debug_id, fd); ret = -ENOMEM; goto err; } binder_debug(BINDER_DEBUG_TRANSACTION, "fd fixup txn %d fd %d\n", t->debug_id, fd); trace_binder_transaction_fd_recv(t, fd, fixup->offset); fixup->target_fd = fd; if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer, fixup->offset, &fd, sizeof(u32))) { ret = -EINVAL; goto err; } } list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { fd_install(fixup->target_fd, fixup->file); list_del(&fixup->fixup_entry); kfree(fixup); } return ret; err: binder_free_txn_fixups(t); return ret; } static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, binder_size_t *consumed, int non_block) { void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; int ret = 0; int wait_for_proc_work; if (*consumed == 0) { if (put_user(BR_NOOP, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); } retry: binder_inner_proc_lock(proc); wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); binder_inner_proc_unlock(proc); thread->looper |= BINDER_LOOPER_STATE_WAITING; trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, !binder_worklist_empty(proc, &thread->todo)); if (wait_for_proc_work) { if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED))) { binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n", proc->pid, thread->pid, thread->looper); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } binder_set_nice(proc->default_priority); } if (non_block) { if (!binder_has_work(thread, wait_for_proc_work)) ret = -EAGAIN; } else { ret = binder_wait_for_work(thread, wait_for_proc_work); } thread->looper &= ~BINDER_LOOPER_STATE_WAITING; if (ret) return ret; while (1) { uint32_t cmd; struct binder_transaction_data_secctx tr; struct binder_transaction_data *trd = &tr.transaction_data; struct binder_work *w = NULL; struct list_head *list = NULL; struct binder_transaction *t = NULL; struct binder_thread *t_from; size_t trsize = sizeof(*trd); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&thread->todo)) list = &thread->todo; else if (!binder_worklist_empty_ilocked(&proc->todo) && wait_for_proc_work) list = &proc->todo; else { binder_inner_proc_unlock(proc); /* no data added */ if (ptr - buffer == 4 && !thread->looper_need_return) goto retry; break; } if (end - ptr < sizeof(tr) + 4) { binder_inner_proc_unlock(proc); break; } w = binder_dequeue_work_head_ilocked(list); if (binder_worklist_empty_ilocked(&thread->todo)) thread->process_todo = false; switch (w->type) { case BINDER_WORK_TRANSACTION: { binder_inner_proc_unlock(proc); t = container_of(w, struct binder_transaction, work); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); WARN_ON(e->cmd == BR_OK); binder_inner_proc_unlock(proc); if (put_user(e->cmd, (uint32_t __user *)ptr)) return -EFAULT; cmd = e->cmd; e->cmd = BR_OK; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); } break; case BINDER_WORK_TRANSACTION_COMPLETE: case BINDER_WORK_TRANSACTION_PENDING: case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: { if (proc->oneway_spam_detection_enabled && w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT) cmd = BR_ONEWAY_SPAM_SUSPECT; else if (w->type == BINDER_WORK_TRANSACTION_PENDING) cmd = BR_TRANSACTION_PENDING_FROZEN; else cmd = BR_TRANSACTION_COMPLETE; binder_inner_proc_unlock(proc); kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, "%d:%d BR_TRANSACTION_COMPLETE\n", proc->pid, thread->pid); } break; case BINDER_WORK_NODE: { struct binder_node *node = container_of(w, struct binder_node, work); int strong, weak; binder_uintptr_t node_ptr = node->ptr; binder_uintptr_t node_cookie = node->cookie; int node_debug_id = node->debug_id; int has_weak_ref; int has_strong_ref; void __user *orig_ptr = ptr; BUG_ON(proc != node->proc); strong = node->internal_strong_refs || node->local_strong_refs; weak = !hlist_empty(&node->refs) || node->local_weak_refs || node->tmp_refs || strong; has_strong_ref = node->has_strong_ref; has_weak_ref = node->has_weak_ref; if (weak && !has_weak_ref) { node->has_weak_ref = 1; node->pending_weak_ref = 1; node->local_weak_refs++; } if (strong && !has_strong_ref) { node->has_strong_ref = 1; node->pending_strong_ref = 1; node->local_strong_refs++; } if (!strong && has_strong_ref) node->has_strong_ref = 0; if (!weak && has_weak_ref) node->has_weak_ref = 0; if (!weak && !strong) { binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx deleted\n", proc->pid, thread->pid, node_debug_id, (u64)node_ptr, (u64)node_cookie); rb_erase(&node->rb_node, &proc->nodes); binder_inner_proc_unlock(proc); binder_node_lock(node); /* * Acquire the node lock before freeing the * node to serialize with other threads that * may have been holding the node lock while * decrementing this node (avoids race where * this thread frees while the other thread * is unlocking the node after the final * decrement) */ binder_node_unlock(node); binder_free_node(node); } else binder_inner_proc_unlock(proc); if (weak && !has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_INCREFS, "BR_INCREFS"); if (!ret && strong && !has_strong_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_ACQUIRE, "BR_ACQUIRE"); if (!ret && !strong && has_strong_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_RELEASE, "BR_RELEASE"); if (!ret && !weak && has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_DECREFS, "BR_DECREFS"); if (orig_ptr == ptr) binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx state unchanged\n", proc->pid, thread->pid, node_debug_id, (u64)node_ptr, (u64)node_cookie); if (ret) return ret; } break; case BINDER_WORK_DEAD_BINDER: case BINDER_WORK_DEAD_BINDER_AND_CLEAR: case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { struct binder_ref_death *death; uint32_t cmd; binder_uintptr_t cookie; death = container_of(w, struct binder_ref_death, work); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; else cmd = BR_DEAD_BINDER; cookie = death->cookie; binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx\n", proc->pid, thread->pid, cmd == BR_DEAD_BINDER ? "BR_DEAD_BINDER" : "BR_CLEAR_DEATH_NOTIFICATION_DONE", (u64)cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { binder_inner_proc_unlock(proc); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } else { binder_enqueue_work_ilocked( w, &proc->delivered_death); binder_inner_proc_unlock(proc); } if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, cmd); if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; case BINDER_WORK_FROZEN_BINDER: { struct binder_ref_freeze *freeze; struct binder_frozen_state_info info; memset(&info, 0, sizeof(info)); freeze = container_of(w, struct binder_ref_freeze, work); info.is_frozen = freeze->is_frozen; info.cookie = freeze->cookie; freeze->sent = true; binder_enqueue_work_ilocked(w, &proc->delivered_freeze); binder_inner_proc_unlock(proc); if (put_user(BR_FROZEN_BINDER, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (copy_to_user(ptr, &info, sizeof(info))) return -EFAULT; ptr += sizeof(info); binder_stat_br(proc, thread, BR_FROZEN_BINDER); goto done; /* BR_FROZEN_BINDER notifications can cause transactions */ } break; case BINDER_WORK_CLEAR_FREEZE_NOTIFICATION: { struct binder_ref_freeze *freeze = container_of(w, struct binder_ref_freeze, work); binder_uintptr_t cookie = freeze->cookie; binder_inner_proc_unlock(proc); kfree(freeze); binder_stats_deleted(BINDER_STAT_FREEZE); if (put_user(BR_CLEAR_FREEZE_NOTIFICATION_DONE, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, BR_CLEAR_FREEZE_NOTIFICATION_DONE); } break; default: binder_inner_proc_unlock(proc); pr_err("%d:%d: bad work type %d\n", proc->pid, thread->pid, w->type); break; } if (!t) continue; BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; trd->target.ptr = target_node->ptr; trd->cookie = target_node->cookie; t->saved_priority = task_nice(current); if (t->priority < target_node->min_priority && !(t->flags & TF_ONE_WAY)) binder_set_nice(t->priority); else if (!(t->flags & TF_ONE_WAY) || t->saved_priority > target_node->min_priority) binder_set_nice(target_node->min_priority); cmd = BR_TRANSACTION; } else { trd->target.ptr = 0; trd->cookie = 0; cmd = BR_REPLY; } trd->code = t->code; trd->flags = t->flags; trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid); t_from = binder_get_txn_from(t); if (t_from) { struct task_struct *sender = t_from->proc->tsk; trd->sender_pid = task_tgid_nr_ns(sender, task_active_pid_ns(current)); } else { trd->sender_pid = 0; } ret = binder_apply_fd_fixups(proc, t); if (ret) { struct binder_buffer *buffer = t->buffer; bool oneway = !!(t->flags & TF_ONE_WAY); int tid = t->debug_id; if (t_from) binder_thread_dec_tmpref(t_from); buffer->transaction = NULL; binder_cleanup_transaction(t, "fd fixups failed", BR_FAILED_REPLY); binder_free_buf(proc, thread, buffer, true); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", proc->pid, thread->pid, oneway ? "async " : (cmd == BR_REPLY ? "reply " : ""), tid, BR_FAILED_REPLY, ret, __LINE__); if (cmd == BR_REPLY) { cmd = BR_FAILED_REPLY; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); break; } continue; } trd->data_size = t->buffer->data_size; trd->offsets_size = t->buffer->offsets_size; trd->data.ptr.buffer = t->buffer->user_data; trd->data.ptr.offsets = trd->data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); tr.secctx = t->security_ctx; if (t->security_ctx) { cmd = BR_TRANSACTION_SEC_CTX; trsize = sizeof(tr); } if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); binder_cleanup_transaction(t, "put_user failed", BR_FAILED_REPLY); return -EFAULT; } ptr += sizeof(uint32_t); if (copy_to_user(ptr, &tr, trsize)) { if (t_from) binder_thread_dec_tmpref(t_from); binder_cleanup_transaction(t, "copy_to_user failed", BR_FAILED_REPLY); return -EFAULT; } ptr += trsize; trace_binder_transaction_received(t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d %s %d %d:%d, cmd %u size %zd-%zd\n", proc->pid, thread->pid, (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : (cmd == BR_TRANSACTION_SEC_CTX) ? "BR_TRANSACTION_SEC_CTX" : "BR_REPLY", t->debug_id, t_from ? t_from->proc->pid : 0, t_from ? t_from->pid : 0, cmd, t->buffer->data_size, t->buffer->offsets_size); if (t_from) binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) { binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; t->to_thread = thread; thread->transaction_stack = t; binder_inner_proc_unlock(thread->proc); } else { binder_free_transaction(t); } break; } done: *consumed = ptr - buffer; binder_inner_proc_lock(proc); if (proc->requested_threads == 0 && list_empty(&thread->proc->waiting_threads) && proc->requested_threads_started < proc->max_threads && (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ /*spawn a new thread if we leave this out */) { proc->requested_threads++; binder_inner_proc_unlock(proc); binder_debug(BINDER_DEBUG_THREADS, "%d:%d BR_SPAWN_LOOPER\n", proc->pid, thread->pid); if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer)) return -EFAULT; binder_stat_br(proc, thread, BR_SPAWN_LOOPER); } else binder_inner_proc_unlock(proc); return 0; } static void binder_release_work(struct binder_proc *proc, struct list_head *list) { struct binder_work *w; enum binder_work_type wtype; while (1) { binder_inner_proc_lock(proc); w = binder_dequeue_work_head_ilocked(list); wtype = w ? w->type : 0; binder_inner_proc_unlock(proc); if (!w) return; switch (wtype) { case BINDER_WORK_TRANSACTION: { struct binder_transaction *t; t = container_of(w, struct binder_transaction, work); binder_cleanup_transaction(t, "process died.", BR_DEAD_REPLY); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_ERROR: %u\n", e->cmd); } break; case BINDER_WORK_TRANSACTION_PENDING: case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: case BINDER_WORK_TRANSACTION_COMPLETE: { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_COMPLETE\n"); kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; case BINDER_WORK_DEAD_BINDER_AND_CLEAR: case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { struct binder_ref_death *death; death = container_of(w, struct binder_ref_death, work); binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered death notification, %016llx\n", (u64)death->cookie); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } break; case BINDER_WORK_NODE: break; case BINDER_WORK_CLEAR_FREEZE_NOTIFICATION: { struct binder_ref_freeze *freeze; freeze = container_of(w, struct binder_ref_freeze, work); binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered freeze notification, %016llx\n", (u64)freeze->cookie); kfree(freeze); binder_stats_deleted(BINDER_STAT_FREEZE); } break; default: pr_err("unexpected work type, %d, not freed\n", wtype); break; } } } static struct binder_thread *binder_get_thread_ilocked( struct binder_proc *proc, struct binder_thread *new_thread) { struct binder_thread *thread = NULL; struct rb_node *parent = NULL; struct rb_node **p = &proc->threads.rb_node; while (*p) { parent = *p; thread = rb_entry(parent, struct binder_thread, rb_node); if (current->pid < thread->pid) p = &(*p)->rb_left; else if (current->pid > thread->pid) p = &(*p)->rb_right; else return thread; } if (!new_thread) return NULL; thread = new_thread; binder_stats_created(BINDER_STAT_THREAD); thread->proc = proc; thread->pid = current->pid; atomic_set(&thread->tmp_ref, 0); init_waitqueue_head(&thread->wait); INIT_LIST_HEAD(&thread->todo); rb_link_node(&thread->rb_node, parent, p); rb_insert_color(&thread->rb_node, &proc->threads); thread->looper_need_return = true; thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; thread->ee.command = BR_OK; INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } static struct binder_thread *binder_get_thread(struct binder_proc *proc) { struct binder_thread *thread; struct binder_thread *new_thread; binder_inner_proc_lock(proc); thread = binder_get_thread_ilocked(proc, NULL); binder_inner_proc_unlock(proc); if (!thread) { new_thread = kzalloc(sizeof(*thread), GFP_KERNEL); if (new_thread == NULL) return NULL; binder_inner_proc_lock(proc); thread = binder_get_thread_ilocked(proc, new_thread); binder_inner_proc_unlock(proc); if (thread != new_thread) kfree(new_thread); } return thread; } static void binder_free_proc(struct binder_proc *proc) { struct binder_device *device; BUG_ON(!list_empty(&proc->todo)); BUG_ON(!list_empty(&proc->delivered_death)); if (proc->outstanding_txns) pr_warn("%s: Unexpected outstanding_txns %d\n", __func__, proc->outstanding_txns); device = container_of(proc->context, struct binder_device, context); if (refcount_dec_and_test(&device->ref)) { binder_remove_device(device); kfree(proc->context->name); kfree(device); } binder_alloc_deferred_release(&proc->alloc); put_task_struct(proc->tsk); put_cred(proc->cred); binder_stats_deleted(BINDER_STAT_PROC); dbitmap_free(&proc->dmap); kfree(proc); } static void binder_free_thread(struct binder_thread *thread) { BUG_ON(!list_empty(&thread->todo)); binder_stats_deleted(BINDER_STAT_THREAD); binder_proc_dec_tmpref(thread->proc); kfree(thread); } static int binder_thread_release(struct binder_proc *proc, struct binder_thread *thread) { struct binder_transaction *t; struct binder_transaction *send_reply = NULL; int active_transactions = 0; struct binder_transaction *last_t = NULL; binder_inner_proc_lock(thread->proc); /* * take a ref on the proc so it survives * after we remove this thread from proc->threads. * The corresponding dec is when we actually * free the thread in binder_free_thread() */ proc->tmp_ref++; /* * take a ref on this thread to ensure it * survives while we are releasing it */ atomic_inc(&thread->tmp_ref); rb_erase(&thread->rb_node, &proc->threads); t = thread->transaction_stack; if (t) { spin_lock(&t->lock); if (t->to_thread == thread) send_reply = t; } else { __acquire(&t->lock); } thread->is_dead = true; while (t) { last_t = t; active_transactions++; binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "release %d:%d transaction %d %s, still active\n", proc->pid, thread->pid, t->debug_id, (t->to_thread == thread) ? "in" : "out"); if (t->to_thread == thread) { thread->proc->outstanding_txns--; t->to_proc = NULL; t->to_thread = NULL; if (t->buffer) { t->buffer->transaction = NULL; t->buffer = NULL; } t = t->to_parent; } else if (t->from == thread) { t->from = NULL; t = t->from_parent; } else BUG(); spin_unlock(&last_t->lock); if (t) spin_lock(&t->lock); else __acquire(&t->lock); } /* annotation for sparse, lock not acquired in last iteration above */ __release(&t->lock); /* * If this thread used poll, make sure we remove the waitqueue from any * poll data structures holding it. */ if (thread->looper & BINDER_LOOPER_STATE_POLL) wake_up_pollfree(&thread->wait); binder_inner_proc_unlock(thread->proc); /* * This is needed to avoid races between wake_up_pollfree() above and * someone else removing the last entry from the queue for other reasons * (e.g. ep_remove_wait_queue() being called due to an epoll file * descriptor being closed). Such other users hold an RCU read lock, so * we can be sure they're done after we call synchronize_rcu(). */ if (thread->looper & BINDER_LOOPER_STATE_POLL) synchronize_rcu(); if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); binder_release_work(proc, &thread->todo); binder_thread_dec_tmpref(thread); return active_transactions; } static __poll_t binder_poll(struct file *filp, struct poll_table_struct *wait) { struct binder_proc *proc = filp->private_data; struct binder_thread *thread = NULL; bool wait_for_proc_work; thread = binder_get_thread(proc); if (!thread) return EPOLLERR; binder_inner_proc_lock(thread->proc); thread->looper |= BINDER_LOOPER_STATE_POLL; wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); binder_inner_proc_unlock(thread->proc); poll_wait(filp, &thread->wait, wait); if (binder_has_work(thread, wait_for_proc_work)) return EPOLLIN; return 0; } static int binder_ioctl_write_read(struct file *filp, unsigned long arg, struct binder_thread *thread) { int ret = 0; struct binder_proc *proc = filp->private_data; void __user *ubuf = (void __user *)arg; struct binder_write_read bwr; if (copy_from_user(&bwr, ubuf, sizeof(bwr))) return -EFAULT; binder_debug(BINDER_DEBUG_READ_WRITE, "%d:%d write %lld at %016llx, read %lld at %016llx\n", proc->pid, thread->pid, (u64)bwr.write_size, (u64)bwr.write_buffer, (u64)bwr.read_size, (u64)bwr.read_buffer); if (bwr.write_size > 0) { ret = binder_thread_write(proc, thread, bwr.write_buffer, bwr.write_size, &bwr.write_consumed); trace_binder_write_done(ret); if (ret < 0) { bwr.read_consumed = 0; goto out; } } if (bwr.read_size > 0) { ret = binder_thread_read(proc, thread, bwr.read_buffer, bwr.read_size, &bwr.read_consumed, filp->f_flags & O_NONBLOCK); trace_binder_read_done(ret); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&proc->todo)) binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); if (ret < 0) goto out; } binder_debug(BINDER_DEBUG_READ_WRITE, "%d:%d wrote %lld of %lld, read return %lld of %lld\n", proc->pid, thread->pid, (u64)bwr.write_consumed, (u64)bwr.write_size, (u64)bwr.read_consumed, (u64)bwr.read_size); out: if (copy_to_user(ubuf, &bwr, sizeof(bwr))) ret = -EFAULT; return ret; } static int binder_ioctl_set_ctx_mgr(struct file *filp, struct flat_binder_object *fbo) { int ret = 0; struct binder_proc *proc = filp->private_data; struct binder_context *context = proc->context; struct binder_node *new_node; kuid_t curr_euid = current_euid(); guard(mutex)(&context->context_mgr_node_lock); if (context->binder_context_mgr_node) { pr_err("BINDER_SET_CONTEXT_MGR already set\n"); return -EBUSY; } ret = security_binder_set_context_mgr(proc->cred); if (ret < 0) return ret; if (uid_valid(context->binder_context_mgr_uid)) { if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) { pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", from_kuid(&init_user_ns, curr_euid), from_kuid(&init_user_ns, context->binder_context_mgr_uid)); return -EPERM; } } else { context->binder_context_mgr_uid = curr_euid; } new_node = binder_new_node(proc, fbo); if (!new_node) return -ENOMEM; binder_node_lock(new_node); new_node->local_weak_refs++; new_node->local_strong_refs++; new_node->has_strong_ref = 1; new_node->has_weak_ref = 1; context->binder_context_mgr_node = new_node; binder_node_unlock(new_node); binder_put_node(new_node); return ret; } static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc, struct binder_node_info_for_ref *info) { struct binder_node *node; struct binder_context *context = proc->context; __u32 handle = info->handle; if (info->strong_count || info->weak_count || info->reserved1 || info->reserved2 || info->reserved3) { binder_user_error("%d BINDER_GET_NODE_INFO_FOR_REF: only handle may be non-zero.", proc->pid); return -EINVAL; } /* This ioctl may only be used by the context manager */ mutex_lock(&context->context_mgr_node_lock); if (!context->binder_context_mgr_node || context->binder_context_mgr_node->proc != proc) { mutex_unlock(&context->context_mgr_node_lock); return -EPERM; } mutex_unlock(&context->context_mgr_node_lock); node = binder_get_node_from_ref(proc, handle, true, NULL); if (!node) return -EINVAL; info->strong_count = node->local_strong_refs + node->internal_strong_refs; info->weak_count = node->local_weak_refs; binder_put_node(node); return 0; } static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, struct binder_node_debug_info *info) { struct rb_node *n; binder_uintptr_t ptr = info->ptr; memset(info, 0, sizeof(*info)); binder_inner_proc_lock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); if (node->ptr > ptr) { info->ptr = node->ptr; info->cookie = node->cookie; info->has_strong_ref = node->has_strong_ref; info->has_weak_ref = node->has_weak_ref; break; } } binder_inner_proc_unlock(proc); return 0; } static bool binder_txns_pending_ilocked(struct binder_proc *proc) { struct rb_node *n; struct binder_thread *thread; if (proc->outstanding_txns > 0) return true; for (n = rb_first(&proc->threads); n; n = rb_next(n)) { thread = rb_entry(n, struct binder_thread, rb_node); if (thread->transaction_stack) return true; } return false; } static void binder_add_freeze_work(struct binder_proc *proc, bool is_frozen) { struct binder_node *prev = NULL; struct rb_node *n; struct binder_ref *ref; binder_inner_proc_lock(proc); for (n = rb_first(&proc->nodes); n; n = rb_next(n)) { struct binder_node *node; node = rb_entry(n, struct binder_node, rb_node); binder_inc_node_tmpref_ilocked(node); binder_inner_proc_unlock(proc); if (prev) binder_put_node(prev); binder_node_lock(node); hlist_for_each_entry(ref, &node->refs, node_entry) { /* * Need the node lock to synchronize * with new notification requests and the * inner lock to synchronize with queued * freeze notifications. */ binder_inner_proc_lock(ref->proc); if (!ref->freeze) { binder_inner_proc_unlock(ref->proc); continue; } ref->freeze->work.type = BINDER_WORK_FROZEN_BINDER; if (list_empty(&ref->freeze->work.entry)) { ref->freeze->is_frozen = is_frozen; binder_enqueue_work_ilocked(&ref->freeze->work, &ref->proc->todo); binder_wakeup_proc_ilocked(ref->proc); } else { if (ref->freeze->sent && ref->freeze->is_frozen != is_frozen) ref->freeze->resend = true; ref->freeze->is_frozen = is_frozen; } binder_inner_proc_unlock(ref->proc); } prev = node; binder_node_unlock(node); binder_inner_proc_lock(proc); if (proc->is_dead) break; } binder_inner_proc_unlock(proc); if (prev) binder_put_node(prev); } static int binder_ioctl_freeze(struct binder_freeze_info *info, struct binder_proc *target_proc) { int ret = 0; if (!info->enable) { binder_inner_proc_lock(target_proc); target_proc->sync_recv = false; target_proc->async_recv = false; target_proc->is_frozen = false; binder_inner_proc_unlock(target_proc); binder_add_freeze_work(target_proc, false); return 0; } /* * Freezing the target. Prevent new transactions by * setting frozen state. If timeout specified, wait * for transactions to drain. */ binder_inner_proc_lock(target_proc); target_proc->sync_recv = false; target_proc->async_recv = false; target_proc->is_frozen = true; binder_inner_proc_unlock(target_proc); if (info->timeout_ms > 0) ret = wait_event_interruptible_timeout( target_proc->freeze_wait, (!target_proc->outstanding_txns), msecs_to_jiffies(info->timeout_ms)); /* Check pending transactions that wait for reply */ if (ret >= 0) { binder_inner_proc_lock(target_proc); if (binder_txns_pending_ilocked(target_proc)) ret = -EAGAIN; binder_inner_proc_unlock(target_proc); } if (ret < 0) { binder_inner_proc_lock(target_proc); target_proc->is_frozen = false; binder_inner_proc_unlock(target_proc); } else { binder_add_freeze_work(target_proc, true); } return ret; } static int binder_ioctl_get_freezer_info( struct binder_frozen_status_info *info) { struct binder_proc *target_proc; bool found = false; __u32 txns_pending; info->sync_recv = 0; info->async_recv = 0; mutex_lock(&binder_procs_lock); hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid == info->pid) { found = true; binder_inner_proc_lock(target_proc); txns_pending = binder_txns_pending_ilocked(target_proc); info->sync_recv |= target_proc->sync_recv | (txns_pending << 1); info->async_recv |= target_proc->async_recv; binder_inner_proc_unlock(target_proc); } } mutex_unlock(&binder_procs_lock); if (!found) return -EINVAL; return 0; } static int binder_ioctl_get_extended_error(struct binder_thread *thread, void __user *ubuf) { struct binder_extended_error ee; binder_inner_proc_lock(thread->proc); ee = thread->ee; binder_set_extended_error(&thread->ee, 0, BR_OK, 0); binder_inner_proc_unlock(thread->proc); if (copy_to_user(ubuf, &ee, sizeof(ee))) return -EFAULT; return 0; } static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; struct binder_proc *proc = filp->private_data; struct binder_thread *thread; void __user *ubuf = (void __user *)arg; trace_binder_ioctl(cmd, arg); ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret) goto err_unlocked; thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; goto err; } switch (cmd) { case BINDER_WRITE_READ: ret = binder_ioctl_write_read(filp, arg, thread); if (ret) goto err; break; case BINDER_SET_MAX_THREADS: { u32 max_threads; if (copy_from_user(&max_threads, ubuf, sizeof(max_threads))) { ret = -EINVAL; goto err; } binder_inner_proc_lock(proc); proc->max_threads = max_threads; binder_inner_proc_unlock(proc); break; } case BINDER_SET_CONTEXT_MGR_EXT: { struct flat_binder_object fbo; if (copy_from_user(&fbo, ubuf, sizeof(fbo))) { ret = -EINVAL; goto err; } ret = binder_ioctl_set_ctx_mgr(filp, &fbo); if (ret) goto err; break; } case BINDER_SET_CONTEXT_MGR: ret = binder_ioctl_set_ctx_mgr(filp, NULL); if (ret) goto err; break; case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", proc->pid, thread->pid); binder_thread_release(proc, thread); thread = NULL; break; case BINDER_VERSION: { struct binder_version __user *ver = ubuf; if (put_user(BINDER_CURRENT_PROTOCOL_VERSION, &ver->protocol_version)) { ret = -EINVAL; goto err; } break; } case BINDER_GET_NODE_INFO_FOR_REF: { struct binder_node_info_for_ref info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_node_info_for_ref(proc, &info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_GET_NODE_DEBUG_INFO: { struct binder_node_debug_info info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_node_debug_info(proc, &info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_FREEZE: { struct binder_freeze_info info; struct binder_proc **target_procs = NULL, *target_proc; int target_procs_count = 0, i = 0; ret = 0; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } mutex_lock(&binder_procs_lock); hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid == info.pid) target_procs_count++; } if (target_procs_count == 0) { mutex_unlock(&binder_procs_lock); ret = -EINVAL; goto err; } target_procs = kcalloc(target_procs_count, sizeof(struct binder_proc *), GFP_KERNEL); if (!target_procs) { mutex_unlock(&binder_procs_lock); ret = -ENOMEM; goto err; } hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid != info.pid) continue; binder_inner_proc_lock(target_proc); target_proc->tmp_ref++; binder_inner_proc_unlock(target_proc); target_procs[i++] = target_proc; } mutex_unlock(&binder_procs_lock); for (i = 0; i < target_procs_count; i++) { if (ret >= 0) ret = binder_ioctl_freeze(&info, target_procs[i]); binder_proc_dec_tmpref(target_procs[i]); } kfree(target_procs); if (ret < 0) goto err; break; } case BINDER_GET_FROZEN_INFO: { struct binder_frozen_status_info info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_freezer_info(&info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: { uint32_t enable; if (copy_from_user(&enable, ubuf, sizeof(enable))) { ret = -EFAULT; goto err; } binder_inner_proc_lock(proc); proc->oneway_spam_detection_enabled = (bool)enable; binder_inner_proc_unlock(proc); break; } case BINDER_GET_EXTENDED_ERROR: ret = binder_ioctl_get_extended_error(thread, ubuf); if (ret < 0) goto err; break; default: ret = -EINVAL; goto err; } ret = 0; err: if (thread) thread->looper_need_return = false; wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -EINTR) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); err_unlocked: trace_binder_ioctl_done(ret); return ret; } static void binder_vma_open(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); } static void binder_vma_close(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); binder_alloc_vma_close(&proc->alloc); } VISIBLE_IF_KUNIT vm_fault_t binder_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } EXPORT_SYMBOL_IF_KUNIT(binder_vm_fault); static const struct vm_operations_struct binder_vm_ops = { .open = binder_vma_open, .close = binder_vma_close, .fault = binder_vm_fault, }; static int binder_mmap(struct file *filp, struct vm_area_struct *vma) { struct binder_proc *proc = filp->private_data; if (proc->tsk != current->group_leader) return -EINVAL; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", __func__, proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); return -EPERM; } vm_flags_mod(vma, VM_DONTCOPY | VM_MIXEDMAP, VM_MAYWRITE); vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; return binder_alloc_mmap_handler(&proc->alloc, vma); } static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc, *itr; struct binder_device *binder_dev; struct binderfs_info *info; struct dentry *binder_binderfs_dir_entry_proc = NULL; bool existing_pid = false; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__, current->group_leader->pid, current->pid); proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; dbitmap_init(&proc->dmap); spin_lock_init(&proc->inner_lock); spin_lock_init(&proc->outer_lock); get_task_struct(current->group_leader); proc->tsk = current->group_leader; proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->freeze_wait); proc->default_priority = task_nice(current); /* binderfs stashes devices in i_private */ if (is_binderfs_device(nodp)) { binder_dev = nodp->i_private; info = nodp->i_sb->s_fs_info; binder_binderfs_dir_entry_proc = info->proc_log_dir; } else { binder_dev = container_of(filp->private_data, struct binder_device, miscdev); } refcount_inc(&binder_dev->ref); proc->context = &binder_dev->context; binder_alloc_init(&proc->alloc); binder_stats_created(BINDER_STAT_PROC); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); INIT_LIST_HEAD(&proc->delivered_freeze); INIT_LIST_HEAD(&proc->waiting_threads); filp->private_data = proc; mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == proc->pid) { existing_pid = true; break; } } hlist_add_head(&proc->proc_node, &binder_procs); mutex_unlock(&binder_procs_lock); if (binder_debugfs_dir_entry_proc && !existing_pid) { char strbuf[11]; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); /* * proc debug entries are shared between contexts. * Only create for the first PID to avoid debugfs log spamming * The printing code will anyway print all contexts for a given * PID so this is not a problem. */ proc->debugfs_entry = debugfs_create_file(strbuf, 0444, binder_debugfs_dir_entry_proc, (void *)(unsigned long)proc->pid, &proc_fops); } if (binder_binderfs_dir_entry_proc && !existing_pid) { char strbuf[11]; struct dentry *binderfs_entry; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); /* * Similar to debugfs, the process specific log file is shared * between contexts. Only create for the first PID. * This is ok since same as debugfs, the log file will contain * information on all contexts of a given PID. */ binderfs_entry = binderfs_create_file(binder_binderfs_dir_entry_proc, strbuf, &proc_fops, (void *)(unsigned long)proc->pid); if (!IS_ERR(binderfs_entry)) { proc->binderfs_entry = binderfs_entry; } else { int error; error = PTR_ERR(binderfs_entry); pr_warn("Unable to create file %s in binderfs (error %d)\n", strbuf, error); } } return 0; } static int binder_flush(struct file *filp, fl_owner_t id) { struct binder_proc *proc = filp->private_data; binder_defer_work(proc, BINDER_DEFERRED_FLUSH); return 0; } static void binder_deferred_flush(struct binder_proc *proc) { struct rb_node *n; int wake_count = 0; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); thread->looper_need_return = true; if (thread->looper & BINDER_LOOPER_STATE_WAITING) { wake_up_interruptible(&thread->wait); wake_count++; } } binder_inner_proc_unlock(proc); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_flush: %d woke %d threads\n", proc->pid, wake_count); } static int binder_release(struct inode *nodp, struct file *filp) { struct binder_proc *proc = filp->private_data; debugfs_remove(proc->debugfs_entry); if (proc->binderfs_entry) { simple_recursive_removal(proc->binderfs_entry, NULL); proc->binderfs_entry = NULL; } binder_defer_work(proc, BINDER_DEFERRED_RELEASE); return 0; } static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; int death = 0; struct binder_proc *proc = node->proc; binder_release_work(proc, &node->async_todo); binder_node_lock(node); binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(&node->work); /* * The caller must have taken a temporary ref on the node, */ BUG_ON(!node->tmp_refs); if (hlist_empty(&node->refs) && node->tmp_refs == 1) { binder_inner_proc_unlock(proc); binder_node_unlock(node); binder_free_node(node); return refs; } node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; binder_inner_proc_unlock(proc); spin_lock(&binder_dead_nodes_lock); hlist_add_head(&node->dead_node, &binder_dead_nodes); spin_unlock(&binder_dead_nodes_lock); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; /* * Need the node lock to synchronize * with new notification requests and the * inner lock to synchronize with queued * death notifications. */ binder_inner_proc_lock(ref->proc); if (!ref->death) { binder_inner_proc_unlock(ref->proc); continue; } death++; BUG_ON(!list_empty(&ref->death->work.entry)); ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_enqueue_work_ilocked(&ref->death->work, &ref->proc->todo); binder_wakeup_proc_ilocked(ref->proc); binder_inner_proc_unlock(ref->proc); } binder_debug(BINDER_DEBUG_DEAD_BINDER, "node %d now dead, refs %d, death %d\n", node->debug_id, refs, death); binder_node_unlock(node); binder_put_node(node); return refs; } static void binder_deferred_release(struct binder_proc *proc) { struct binder_context *context = proc->context; struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, active_transactions; mutex_lock(&binder_procs_lock); hlist_del(&proc->proc_node); mutex_unlock(&binder_procs_lock); mutex_lock(&context->context_mgr_node_lock); if (context->binder_context_mgr_node && context->binder_context_mgr_node->proc == proc) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%s: %d context_mgr_node gone\n", __func__, proc->pid); context->binder_context_mgr_node = NULL; } mutex_unlock(&context->context_mgr_node_lock); binder_inner_proc_lock(proc); /* * Make sure proc stays alive after we * remove all the threads */ proc->tmp_ref++; proc->is_dead = true; proc->is_frozen = false; proc->sync_recv = false; proc->async_recv = false; threads = 0; active_transactions = 0; while ((n = rb_first(&proc->threads))) { struct binder_thread *thread; thread = rb_entry(n, struct binder_thread, rb_node); binder_inner_proc_unlock(proc); threads++; active_transactions += binder_thread_release(proc, thread); binder_inner_proc_lock(proc); } nodes = 0; incoming_refs = 0; while ((n = rb_first(&proc->nodes))) { struct binder_node *node; node = rb_entry(n, struct binder_node, rb_node); nodes++; /* * take a temporary ref on the node before * calling binder_node_release() which will either * kfree() the node or call binder_put_node() */ binder_inc_node_tmpref_ilocked(node); rb_erase(&node->rb_node, &proc->nodes); binder_inner_proc_unlock(proc); incoming_refs = binder_node_release(node, incoming_refs); binder_inner_proc_lock(proc); } binder_inner_proc_unlock(proc); outgoing_refs = 0; binder_proc_lock(proc); while ((n = rb_first(&proc->refs_by_desc))) { struct binder_ref *ref; ref = rb_entry(n, struct binder_ref, rb_node_desc); outgoing_refs++; binder_cleanup_ref_olocked(ref); binder_proc_unlock(proc); binder_free_ref(ref); binder_proc_lock(proc); } binder_proc_unlock(proc); binder_release_work(proc, &proc->todo); binder_release_work(proc, &proc->delivered_death); binder_release_work(proc, &proc->delivered_freeze); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", __func__, proc->pid, threads, nodes, incoming_refs, outgoing_refs, active_transactions); binder_proc_dec_tmpref(proc); } static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; int defer; do { mutex_lock(&binder_deferred_lock); if (!hlist_empty(&binder_deferred_list)) { proc = hlist_entry(binder_deferred_list.first, struct binder_proc, deferred_work_node); hlist_del_init(&proc->deferred_work_node); defer = proc->deferred_work; proc->deferred_work = 0; } else { proc = NULL; defer = 0; } mutex_unlock(&binder_deferred_lock); if (defer & BINDER_DEFERRED_FLUSH) binder_deferred_flush(proc); if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ } while (proc); } static DECLARE_WORK(binder_deferred_work, binder_deferred_func); static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) { guard(mutex)(&binder_deferred_lock); proc->deferred_work |= defer; if (hlist_unhashed(&proc->deferred_work_node)) { hlist_add_head(&proc->deferred_work_node, &binder_deferred_list); schedule_work(&binder_deferred_work); } } static void print_binder_transaction_ilocked(struct seq_file *m, struct binder_proc *proc, const char *prefix, struct binder_transaction *t) { struct binder_proc *to_proc; struct binder_buffer *buffer = t->buffer; ktime_t current_time = ktime_get(); spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, "%s %d: %pK from %d:%d to %d:%d code %x flags %x pri %ld a%d r%d elapsed %lldms", prefix, t->debug_id, t, t->from_pid, t->from_tid, to_proc ? to_proc->pid : 0, t->to_thread ? t->to_thread->pid : 0, t->code, t->flags, t->priority, t->is_async, t->is_reply, ktime_ms_delta(current_time, t->start_time)); spin_unlock(&t->lock); if (proc != to_proc) { /* * Can only safely deref buffer if we are holding the * correct proc inner lock for this node */ seq_puts(m, "\n"); return; } if (buffer == NULL) { seq_puts(m, " buffer free\n"); return; } if (buffer->target_node) seq_printf(m, " node %d", buffer->target_node->debug_id); seq_printf(m, " size %zd:%zd offset %lx\n", buffer->data_size, buffer->offsets_size, buffer->user_data - proc->alloc.vm_start); } static void print_binder_work_ilocked(struct seq_file *m, struct binder_proc *proc, const char *prefix, const char *transaction_prefix, struct binder_work *w, bool hash_ptrs) { struct binder_node *node; struct binder_transaction *t; switch (w->type) { case BINDER_WORK_TRANSACTION: t = container_of(w, struct binder_transaction, work); print_binder_transaction_ilocked( m, proc, transaction_prefix, t); break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); seq_printf(m, "%stransaction error: %u\n", prefix, e->cmd); } break; case BINDER_WORK_TRANSACTION_COMPLETE: seq_printf(m, "%stransaction complete\n", prefix); break; case BINDER_WORK_NODE: node = container_of(w, struct binder_node, work); if (hash_ptrs) seq_printf(m, "%snode work %d: u%p c%p\n", prefix, node->debug_id, (void *)(long)node->ptr, (void *)(long)node->cookie); else seq_printf(m, "%snode work %d: u%016llx c%016llx\n", prefix, node->debug_id, (u64)node->ptr, (u64)node->cookie); break; case BINDER_WORK_DEAD_BINDER: seq_printf(m, "%shas dead binder\n", prefix); break; case BINDER_WORK_DEAD_BINDER_AND_CLEAR: seq_printf(m, "%shas cleared dead binder\n", prefix); break; case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: seq_printf(m, "%shas cleared death notification\n", prefix); break; case BINDER_WORK_FROZEN_BINDER: seq_printf(m, "%shas frozen binder\n", prefix); break; case BINDER_WORK_CLEAR_FREEZE_NOTIFICATION: seq_printf(m, "%shas cleared freeze notification\n", prefix); break; default: seq_printf(m, "%sunknown work: type %d\n", prefix, w->type); break; } } static void print_binder_thread_ilocked(struct seq_file *m, struct binder_thread *thread, bool print_always, bool hash_ptrs) { struct binder_transaction *t; struct binder_work *w; size_t start_pos = m->count; size_t header_pos; seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, thread->looper_need_return, atomic_read(&thread->tmp_ref)); header_pos = m->count; t = thread->transaction_stack; while (t) { if (t->from == thread) { print_binder_transaction_ilocked(m, thread->proc, " outgoing transaction", t); t = t->from_parent; } else if (t->to_thread == thread) { print_binder_transaction_ilocked(m, thread->proc, " incoming transaction", t); t = t->to_parent; } else { print_binder_transaction_ilocked(m, thread->proc, " bad transaction", t); t = NULL; } } list_for_each_entry(w, &thread->todo, entry) { print_binder_work_ilocked(m, thread->proc, " ", " pending transaction", w, hash_ptrs); } if (!print_always && m->count == header_pos) m->count = start_pos; } static void print_binder_node_nilocked(struct seq_file *m, struct binder_node *node, bool hash_ptrs) { struct binder_ref *ref; struct binder_work *w; int count; count = hlist_count_nodes(&node->refs); if (hash_ptrs) seq_printf(m, " node %d: u%p c%p", node->debug_id, (void *)(long)node->ptr, (void *)(long)node->cookie); else seq_printf(m, " node %d: u%016llx c%016llx", node->debug_id, (u64)node->ptr, (u64)node->cookie); seq_printf(m, " hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, node->internal_strong_refs, count, node->tmp_refs); if (count) { seq_puts(m, " proc"); hlist_for_each_entry(ref, &node->refs, node_entry) seq_printf(m, " %d", ref->proc->pid); } seq_puts(m, "\n"); if (node->proc) { list_for_each_entry(w, &node->async_todo, entry) print_binder_work_ilocked(m, node->proc, " ", " pending async transaction", w, hash_ptrs); } } static void print_binder_ref_olocked(struct seq_file *m, struct binder_ref *ref) { binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, ref->node->proc ? "" : "dead ", ref->node->debug_id, ref->data.strong, ref->data.weak, ref->death); binder_node_unlock(ref->node); } /** * print_next_binder_node_ilocked() - Print binder_node from a locked list * @m: struct seq_file for output via seq_printf() * @proc: struct binder_proc we hold the inner_proc_lock to (if any) * @node: struct binder_node to print fields of * @prev_node: struct binder_node we hold a temporary reference to (if any) * @hash_ptrs: whether to hash @node's binder_uintptr_t fields * * Helper function to handle synchronization around printing a struct * binder_node while iterating through @proc->nodes or the dead nodes list. * Caller must hold either @proc->inner_lock (for live nodes) or * binder_dead_nodes_lock. This lock will be released during the body of this * function, but it will be reacquired before returning to the caller. * * Return: pointer to the struct binder_node we hold a tmpref on */ static struct binder_node * print_next_binder_node_ilocked(struct seq_file *m, struct binder_proc *proc, struct binder_node *node, struct binder_node *prev_node, bool hash_ptrs) { /* * Take a temporary reference on the node so that isn't freed while * we print it. */ binder_inc_node_tmpref_ilocked(node); /* * Live nodes need to drop the inner proc lock and dead nodes need to * drop the binder_dead_nodes_lock before trying to take the node lock. */ if (proc) binder_inner_proc_unlock(proc); else spin_unlock(&binder_dead_nodes_lock); if (prev_node) binder_put_node(prev_node); binder_node_inner_lock(node); print_binder_node_nilocked(m, node, hash_ptrs); binder_node_inner_unlock(node); if (proc) binder_inner_proc_lock(proc); else spin_lock(&binder_dead_nodes_lock); return node; } static void print_binder_proc(struct seq_file *m, struct binder_proc *proc, bool print_all, bool hash_ptrs) { struct binder_work *w; struct rb_node *n; size_t start_pos = m->count; size_t header_pos; struct binder_node *last_node = NULL; seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); header_pos = m->count; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n; n = rb_next(n)) print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread, rb_node), print_all, hash_ptrs); for (n = rb_first(&proc->nodes); n; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); if (!print_all && !node->has_async_transaction) continue; last_node = print_next_binder_node_ilocked(m, proc, node, last_node, hash_ptrs); } binder_inner_proc_unlock(proc); if (last_node) binder_put_node(last_node); if (print_all) { binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) print_binder_ref_olocked(m, rb_entry(n, struct binder_ref, rb_node_desc)); binder_proc_unlock(proc); } binder_alloc_print_allocated(m, &proc->alloc); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) print_binder_work_ilocked(m, proc, " ", " pending transaction", w, hash_ptrs); list_for_each_entry(w, &proc->delivered_death, entry) { seq_puts(m, " has delivered dead binder\n"); break; } list_for_each_entry(w, &proc->delivered_freeze, entry) { seq_puts(m, " has delivered freeze binder\n"); break; } binder_inner_proc_unlock(proc); if (!print_all && m->count == header_pos) m->count = start_pos; } static const char * const binder_return_strings[] = { "BR_ERROR", "BR_OK", "BR_TRANSACTION", "BR_REPLY", "BR_ACQUIRE_RESULT", "BR_DEAD_REPLY", "BR_TRANSACTION_COMPLETE", "BR_INCREFS", "BR_ACQUIRE", "BR_RELEASE", "BR_DECREFS", "BR_ATTEMPT_ACQUIRE", "BR_NOOP", "BR_SPAWN_LOOPER", "BR_FINISHED", "BR_DEAD_BINDER", "BR_CLEAR_DEATH_NOTIFICATION_DONE", "BR_FAILED_REPLY", "BR_FROZEN_REPLY", "BR_ONEWAY_SPAM_SUSPECT", "BR_TRANSACTION_PENDING_FROZEN", "BR_FROZEN_BINDER", "BR_CLEAR_FREEZE_NOTIFICATION_DONE", }; static const char * const binder_command_strings[] = { "BC_TRANSACTION", "BC_REPLY", "BC_ACQUIRE_RESULT", "BC_FREE_BUFFER", "BC_INCREFS", "BC_ACQUIRE", "BC_RELEASE", "BC_DECREFS", "BC_INCREFS_DONE", "BC_ACQUIRE_DONE", "BC_ATTEMPT_ACQUIRE", "BC_REGISTER_LOOPER", "BC_ENTER_LOOPER", "BC_EXIT_LOOPER", "BC_REQUEST_DEATH_NOTIFICATION", "BC_CLEAR_DEATH_NOTIFICATION", "BC_DEAD_BINDER_DONE", "BC_TRANSACTION_SG", "BC_REPLY_SG", "BC_REQUEST_FREEZE_NOTIFICATION", "BC_CLEAR_FREEZE_NOTIFICATION", "BC_FREEZE_NOTIFICATION_DONE", }; static const char * const binder_objstat_strings[] = { "proc", "thread", "node", "ref", "death", "transaction", "transaction_complete", "freeze", }; static void print_binder_stats(struct seq_file *m, const char *prefix, struct binder_stats *stats) { int i; BUILD_BUG_ON(ARRAY_SIZE(stats->bc) != ARRAY_SIZE(binder_command_strings)); for (i = 0; i < ARRAY_SIZE(stats->bc); i++) { int temp = atomic_read(&stats->bc[i]); if (temp) seq_printf(m, "%s%s: %d\n", prefix, binder_command_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->br) != ARRAY_SIZE(binder_return_strings)); for (i = 0; i < ARRAY_SIZE(stats->br); i++) { int temp = atomic_read(&stats->br[i]); if (temp) seq_printf(m, "%s%s: %d\n", prefix, binder_return_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(binder_objstat_strings)); BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(stats->obj_deleted)); for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { int created = atomic_read(&stats->obj_created[i]); int deleted = atomic_read(&stats->obj_deleted[i]); if (created || deleted) seq_printf(m, "%s%s: active %d total %d\n", prefix, binder_objstat_strings[i], created - deleted, created); } } static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { struct binder_work *w; struct binder_thread *thread; struct rb_node *n; int count, strong, weak, ready_threads; size_t free_async_space = binder_alloc_get_free_async_space(&proc->alloc); seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); count = 0; ready_threads = 0; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n; n = rb_next(n)) count++; list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node) ready_threads++; seq_printf(m, " threads: %d\n", count); seq_printf(m, " requested threads: %d+%d/%d\n" " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, ready_threads, free_async_space); count = 0; for (n = rb_first(&proc->nodes); n; n = rb_next(n)) count++; binder_inner_proc_unlock(proc); seq_printf(m, " nodes: %d\n", count); count = 0; strong = 0; weak = 0; binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) { struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); count++; strong += ref->data.strong; weak += ref->data.weak; } binder_proc_unlock(proc); seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); binder_alloc_print_pages(m, &proc->alloc); count = 0; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { if (w->type == BINDER_WORK_TRANSACTION) count++; } binder_inner_proc_unlock(proc); seq_printf(m, " pending transactions: %d\n", count); print_binder_stats(m, " ", &proc->stats); } static void print_binder_state(struct seq_file *m, bool hash_ptrs) { struct binder_proc *proc; struct binder_node *node; struct binder_node *last_node = NULL; seq_puts(m, "binder state:\n"); spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead nodes:\n"); hlist_for_each_entry(node, &binder_dead_nodes, dead_node) last_node = print_next_binder_node_ilocked(m, NULL, node, last_node, hash_ptrs); spin_unlock(&binder_dead_nodes_lock); if (last_node) binder_put_node(last_node); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, true, hash_ptrs); mutex_unlock(&binder_procs_lock); } static void print_binder_transactions(struct seq_file *m, bool hash_ptrs) { struct binder_proc *proc; seq_puts(m, "binder transactions:\n"); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, false, hash_ptrs); mutex_unlock(&binder_procs_lock); } static int state_show(struct seq_file *m, void *unused) { print_binder_state(m, false); return 0; } static int state_hashed_show(struct seq_file *m, void *unused) { print_binder_state(m, true); return 0; } static int stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; seq_puts(m, "binder stats:\n"); print_binder_stats(m, "", &binder_stats); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); mutex_unlock(&binder_procs_lock); return 0; } static int transactions_show(struct seq_file *m, void *unused) { print_binder_transactions(m, false); return 0; } static int transactions_hashed_show(struct seq_file *m, void *unused) { print_binder_transactions(m, true); return 0; } static int proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; int pid = (unsigned long)m->private; guard(mutex)(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { seq_puts(m, "binder proc state:\n"); print_binder_proc(m, itr, true, false); } } return 0; } static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { int debug_id = READ_ONCE(e->debug_id_done); /* * read barrier to guarantee debug_id_done read before * we print the log values */ smp_rmb(); seq_printf(m, "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, e->from_thread, e->to_proc, e->to_thread, e->context_name, e->to_node, e->target_handle, e->data_size, e->offsets_size, e->return_error, e->return_error_param, e->return_error_line); /* * read-barrier to guarantee read of debug_id_done after * done printing the fields of the entry */ smp_rmb(); seq_printf(m, debug_id && debug_id == READ_ONCE(e->debug_id_done) ? "\n" : " (incomplete)\n"); } static int transaction_log_show(struct seq_file *m, void *unused) { struct binder_transaction_log *log = m->private; unsigned int log_cur = atomic_read(&log->cur); unsigned int count; unsigned int cur; int i; count = log_cur + 1; cur = count < ARRAY_SIZE(log->entry) && !log->full ? 0 : count % ARRAY_SIZE(log->entry); if (count > ARRAY_SIZE(log->entry) || log->full) count = ARRAY_SIZE(log->entry); for (i = 0; i < count; i++) { unsigned int index = cur++ % ARRAY_SIZE(log->entry); print_binder_transaction_log_entry(m, &log->entry[index]); } return 0; } const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, .unlocked_ioctl = binder_ioctl, .compat_ioctl = compat_ptr_ioctl, .mmap = binder_mmap, .open = binder_open, .flush = binder_flush, .release = binder_release, }; DEFINE_SHOW_ATTRIBUTE(state); DEFINE_SHOW_ATTRIBUTE(state_hashed); DEFINE_SHOW_ATTRIBUTE(stats); DEFINE_SHOW_ATTRIBUTE(transactions); DEFINE_SHOW_ATTRIBUTE(transactions_hashed); DEFINE_SHOW_ATTRIBUTE(transaction_log); const struct binder_debugfs_entry binder_debugfs_entries[] = { { .name = "state", .mode = 0444, .fops = &state_fops, .data = NULL, }, { .name = "state_hashed", .mode = 0444, .fops = &state_hashed_fops, .data = NULL, }, { .name = "stats", .mode = 0444, .fops = &stats_fops, .data = NULL, }, { .name = "transactions", .mode = 0444, .fops = &transactions_fops, .data = NULL, }, { .name = "transactions_hashed", .mode = 0444, .fops = &transactions_hashed_fops, .data = NULL, }, { .name = "transaction_log", .mode = 0444, .fops = &transaction_log_fops, .data = &binder_transaction_log, }, { .name = "failed_transaction_log", .mode = 0444, .fops = &transaction_log_fops, .data = &binder_transaction_log_failed, }, {} /* terminator */ }; void binder_add_device(struct binder_device *device) { guard(spinlock)(&binder_devices_lock); hlist_add_head(&device->hlist, &binder_devices); } void binder_remove_device(struct binder_device *device) { guard(spinlock)(&binder_devices_lock); hlist_del_init(&device->hlist); } static int __init init_binder_device(const char *name) { int ret; struct binder_device *binder_device; binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); if (!binder_device) return -ENOMEM; binder_device->miscdev.fops = &binder_fops; binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; refcount_set(&binder_device->ref, 1); binder_device->context.binder_context_mgr_uid = INVALID_UID; binder_device->context.name = name; mutex_init(&binder_device->context.context_mgr_node_lock); ret = misc_register(&binder_device->miscdev); if (ret < 0) { kfree(binder_device); return ret; } binder_add_device(binder_device); return ret; } static int __init binder_init(void) { int ret; char *device_name, *device_tmp; struct binder_device *device; struct hlist_node *tmp; char *device_names = NULL; const struct binder_debugfs_entry *db_entry; ret = binder_alloc_shrinker_init(); if (ret) return ret; atomic_set(&binder_transaction_log.cur, ~0U); atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); binder_for_each_debugfs_entry(db_entry) debugfs_create_file(db_entry->name, db_entry->mode, binder_debugfs_dir_entry_root, db_entry->data, db_entry->fops); binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", binder_debugfs_dir_entry_root); if (!IS_ENABLED(CONFIG_ANDROID_BINDERFS) && strcmp(binder_devices_param, "") != 0) { /* * Copy the module_parameter string, because we don't want to * tokenize it in-place. */ device_names = kstrdup(binder_devices_param, GFP_KERNEL); if (!device_names) { ret = -ENOMEM; goto err_alloc_device_names_failed; } device_tmp = device_names; while ((device_name = strsep(&device_tmp, ","))) { ret = init_binder_device(device_name); if (ret) goto err_init_binder_device_failed; } } ret = genl_register_family(&binder_nl_family); if (ret) goto err_init_binder_device_failed; ret = init_binderfs(); if (ret) goto err_init_binderfs_failed; return ret; err_init_binderfs_failed: genl_unregister_family(&binder_nl_family); err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); binder_remove_device(device); kfree(device); } kfree(device_names); err_alloc_device_names_failed: debugfs_remove_recursive(binder_debugfs_dir_entry_root); binder_alloc_shrinker_exit(); return ret; } device_initcall(binder_init); #define CREATE_TRACE_POINTS #include "binder_trace.h"
177 180 18 83 41 179 16 42 16 110 122 204 158 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 /* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/node.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ /* start node id of a node block dedicated to the given node id */ #define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ #define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 #define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) /* size of free nid batch when shrinking */ #define SHRINK_NID_BATCH_SIZE 8 #define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ #define DEF_RAM_THRESHOLD 1 /* control dirty nats ratio threshold (default: 10% over max nid count) */ #define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 /* control total # of node writes used for roll-forward recovery */ #define DEF_RF_NODE_BLOCKS 0 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NAT_VEC_SIZE 32 /* return value for read_node_page */ #define LOCKED_PAGE 1 /* check pinned file's alignment status of physical blocks */ #define FILE_NOT_ALIGNED 1 /* For flag in struct node_info */ enum { IS_CHECKPOINTED, /* is it checkpointed before? */ HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */ IS_DIRTY, /* this nat entry is dirty? */ IS_PREALLOC, /* nat entry is preallocated */ }; /* For node type in __get_node_folio() */ enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR, NODE_TYPE_NON_INODE, }; /* * For node information */ struct node_info { nid_t nid; /* node id */ nid_t ino; /* inode number of the node's owner */ block_t blk_addr; /* block address of the node */ unsigned char version; /* version of the node */ unsigned char flag; /* for node information bits */ }; struct nat_entry { struct list_head list; /* for clean or dirty nat list */ struct node_info ni; /* in-memory node information */ }; #define nat_get_nid(nat) ((nat)->ni.nid) #define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) #define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) #define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) #define nat_get_ino(nat) ((nat)->ni.ino) #define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) #define nat_get_version(nat) ((nat)->ni.version) #define nat_set_version(nat, v) ((nat)->ni.version = (v)) #define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) { dst->nid = src->nid; dst->ino = src->ino; dst->blk_addr = src->blk_addr; dst->version = src->version; /* should not copy flag here */ } static inline void set_nat_flag(struct nat_entry *ne, unsigned int type, bool set) { if (set) ne->ni.flag |= BIT(type); else ne->ni.flag &= ~BIT(type); } static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) { return ne->ni.flag & BIT(type); } static inline void nat_reset_flag(struct nat_entry *ne) { /* these states can be set only after checkpoint was done */ set_nat_flag(ne, IS_CHECKPOINTED, true); set_nat_flag(ne, HAS_FSYNCED_INODE, false); set_nat_flag(ne, HAS_LAST_FSYNC, true); } static inline void node_info_from_raw_nat(struct node_info *ni, struct f2fs_nat_entry *raw_ne) { ni->ino = le32_to_cpu(raw_ne->ino); ni->blk_addr = le32_to_cpu(raw_ne->block_addr); ni->version = raw_ne->version; } static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, struct node_info *ni) { raw_ne->ino = cpu_to_le32(ni->ino); raw_ne->block_addr = cpu_to_le32(ni->blk_addr); raw_ne->version = ni->version; } static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) { return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * NM_I(sbi)->dirty_nats_ratio / 100; } static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) { return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ READ_EXTENT_CACHE, /* indicates read extent cache */ AGE_EXTENT_CACHE, /* indicates age extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ }; struct nat_entry_set { struct list_head set_list; /* link with other nat sets */ struct list_head entry_list; /* link with dirty nat entries */ nid_t set; /* set number*/ unsigned int entry_cnt; /* the # of nat entries in set */ }; struct free_nid { struct list_head list; /* for free node id list */ nid_t nid; /* node id */ int state; /* in use or not: FREE_NID or PREALLOC_NID */ }; static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; spin_lock(&nm_i->nid_list_lock); if (nm_i->nid_cnt[FREE_NID] <= 0) { spin_unlock(&nm_i->nid_list_lock); return; } fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); } /* * inline functions */ static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, nm_i->bitmap_size)) f2fs_bug_on(sbi, 1); #endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) { struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; /* * block_off = segment_off * 512 + off_in_segment * OLD = (segment_off * 512) * 2 + off_in_segment * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment */ block_off = NAT_BLOCK_OFFSET(start); block_addr = (pgoff_t)(nm_i->nat_blkaddr + (block_off << 1) - (block_off & (BLKS_PER_SEG(sbi) - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += BLKS_PER_SEG(sbi); return block_addr; } static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, pgoff_t block_addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; block_addr ^= BIT(sbi->log_blocks_per_seg); return block_addr + nm_i->nat_blkaddr; } static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) { unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); #ifdef CONFIG_F2FS_CHECK_FS f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); #endif } static inline nid_t ino_of_node(const struct folio *node_folio) { struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.ino); } static inline nid_t nid_of_node(const struct folio *node_folio) { struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.nid); } static inline unsigned int ofs_of_node(const struct folio *node_folio) { struct f2fs_node *rn = F2FS_NODE(node_folio); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } static inline __u64 cpver_of_node(const struct folio *node_folio) { struct f2fs_node *rn = F2FS_NODE(node_folio); return le64_to_cpu(rn->footer.cp_ver); } static inline block_t next_blkaddr_of_node(const struct folio *node_folio) { struct f2fs_node *rn = F2FS_NODE(node_folio); return le32_to_cpu(rn->footer.next_blkaddr); } static inline void fill_node_footer(const struct folio *folio, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { struct f2fs_node *rn = F2FS_NODE(folio); unsigned int old_flag = 0; if (reset) memset(rn, 0, sizeof(*rn)); else old_flag = le32_to_cpu(rn->footer.flag); rn->footer.nid = cpu_to_le32(nid); rn->footer.ino = cpu_to_le32(ino); /* should remain old flag bits such as COLD_BIT_SHIFT */ rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | (old_flag & OFFSET_BIT_MASK)); } static inline void copy_node_footer(const struct folio *dst, const struct folio *src) { struct f2fs_node *src_rn = F2FS_NODE(src); struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } static inline void fill_node_footer_blkaddr(struct folio *folio, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); struct f2fs_node *rn = F2FS_NODE(folio); __u64 cp_ver = cur_cp_version(ckpt); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } static inline bool is_recoverable_dnode(const struct folio *folio) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); __u64 cp_ver = cur_cp_version(ckpt); /* Don't care crc part, if fsck.f2fs sets it. */ if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) return (cp_ver << 32) == (cpver_of_node(folio) << 32); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); return cp_ver == cpver_of_node(folio); } /* * f2fs assigns the following node offsets described as (num). * N = NIDS_PER_BLOCK * * Inode block (0) * |- direct node (1) * |- direct node (2) * |- indirect node (3) * | `- direct node (4 => 4 + N - 1) * |- indirect node (4 + N) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) * `- direct node * ...... * `- indirect node ((6 + 2N) + x(N + 1)) * `- direct node * ...... * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ static inline bool IS_DNODE(const struct folio *node_folio) { unsigned int ofs = ofs_of_node(node_folio); if (f2fs_has_xattr_block(ofs)) return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) return false; if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { ofs -= 6 + 2 * NIDS_PER_BLOCK; if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) return false; } return true; } static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(folio); f2fs_folio_wait_writeback(folio, NODE, true, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); return folio_mark_dirty(folio); } static inline nid_t get_nid(const struct folio *folio, int off, bool i) { struct f2fs_node *rn = F2FS_NODE(folio); if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); return le32_to_cpu(rn->in.nid[off]); } /* * Coldness identification: * - Mark cold files in f2fs_inode_info * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ static inline int is_node(const struct folio *folio, int type) { struct f2fs_node *rn = F2FS_NODE(folio); return le32_to_cpu(rn->footer.flag) & BIT(type); } #define is_cold_node(folio) is_node(folio, COLD_BIT_SHIFT) #define is_fsync_dnode(folio) is_node(folio, FSYNC_BIT_SHIFT) #define is_dent_dnode(folio) is_node(folio, DENT_BIT_SHIFT) static inline void set_cold_node(const struct folio *folio, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (is_dir) flag &= ~BIT(COLD_BIT_SHIFT); else flag |= BIT(COLD_BIT_SHIFT); rn->footer.flag = cpu_to_le32(flag); } static inline void set_mark(struct folio *folio, int mark, int type) { struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) flag |= BIT(type); else flag &= ~BIT(type); rn->footer.flag = cpu_to_le32(flag); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_F_SB(folio), folio); #endif } #define set_dentry_mark(folio, mark) set_mark(folio, mark, DENT_BIT_SHIFT) #define set_fsync_mark(folio, mark) set_mark(folio, mark, FSYNC_BIT_SHIFT)
592 590 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 // SPDX-License-Identifier: GPL-2.0 /* * Device physical location support * * Author: Won Chung <wonchung@google.com> */ #include <linux/acpi.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "physical_location.h" bool dev_add_physical_location(struct device *dev) { struct acpi_pld_info *pld; if (!has_acpi_companion(dev)) return false; if (!acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld)) return false; dev->physical_location = kzalloc(sizeof(*dev->physical_location), GFP_KERNEL); if (!dev->physical_location) { ACPI_FREE(pld); return false; } dev->physical_location->panel = pld->panel; dev->physical_location->vertical_position = pld->vertical_position; dev->physical_location->horizontal_position = pld->horizontal_position; dev->physical_location->dock = pld->dock; dev->physical_location->lid = pld->lid; ACPI_FREE(pld); return true; } static ssize_t panel_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *panel; switch (dev->physical_location->panel) { case DEVICE_PANEL_TOP: panel = "top"; break; case DEVICE_PANEL_BOTTOM: panel = "bottom"; break; case DEVICE_PANEL_LEFT: panel = "left"; break; case DEVICE_PANEL_RIGHT: panel = "right"; break; case DEVICE_PANEL_FRONT: panel = "front"; break; case DEVICE_PANEL_BACK: panel = "back"; break; default: panel = "unknown"; } return sysfs_emit(buf, "%s\n", panel); } static DEVICE_ATTR_RO(panel); static ssize_t vertical_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *vertical_position; switch (dev->physical_location->vertical_position) { case DEVICE_VERT_POS_UPPER: vertical_position = "upper"; break; case DEVICE_VERT_POS_CENTER: vertical_position = "center"; break; case DEVICE_VERT_POS_LOWER: vertical_position = "lower"; break; default: vertical_position = "unknown"; } return sysfs_emit(buf, "%s\n", vertical_position); } static DEVICE_ATTR_RO(vertical_position); static ssize_t horizontal_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *horizontal_position; switch (dev->physical_location->horizontal_position) { case DEVICE_HORI_POS_LEFT: horizontal_position = "left"; break; case DEVICE_HORI_POS_CENTER: horizontal_position = "center"; break; case DEVICE_HORI_POS_RIGHT: horizontal_position = "right"; break; default: horizontal_position = "unknown"; } return sysfs_emit(buf, "%s\n", horizontal_position); } static DEVICE_ATTR_RO(horizontal_position); static ssize_t dock_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->dock)); } static DEVICE_ATTR_RO(dock); static ssize_t lid_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_yes_no(dev->physical_location->lid)); } static DEVICE_ATTR_RO(lid); static struct attribute *dev_attr_physical_location[] = { &dev_attr_panel.attr, &dev_attr_vertical_position.attr, &dev_attr_horizontal_position.attr, &dev_attr_dock.attr, &dev_attr_lid.attr, NULL, }; const struct attribute_group dev_attr_physical_location_group = { .name = "physical_location", .attrs = dev_attr_physical_location, };
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2018-2025 Intel Corporation * * Transmit and frame generation functions. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <linux/etherdevice.h> #include <linux/bitmap.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> #include <net/codel.h> #include <net/codel_impl.h> #include <linux/unaligned.h> #include <net/fq_impl.h> #include <net/sock.h> #include <net/gso.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "led.h" #include "mesh.h" #include "wep.h" #include "wpa.h" #include "wme.h" #include "rate.h" /* misc utils */ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct sk_buff *skb, int group_addr, int next_frag_len) { int rate, mrate, erp, dur, i; struct ieee80211_rate *txrate; struct ieee80211_local *local = tx->local; struct ieee80211_supported_band *sband; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); /* assume HW handles this */ if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) return 0; /* uh huh? */ if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; if (info->band >= NUM_NL80211_BANDS) return 0; sband = local->hw.wiphy->bands[info->band]; txrate = &sband->bitrates[tx->rate.idx]; erp = txrate->flags & IEEE80211_RATE_ERP_G; /* device is expected to do this */ if (sband->band == NL80211_BAND_S1GHZ) return 0; /* * data and mgmt (except PS Poll): * - during CFP: 32768 * - during contention period: * if addr1 is group address: 0 * if more fragments = 0 and addr1 is individual address: time to * transmit one ACK plus SIFS * if more fragments = 1 and addr1 is individual address: time to * transmit next fragment plus 2 x ACK plus 3 x SIFS * * IEEE 802.11, 9.6: * - control response frame (CTS or ACK) shall be transmitted using the * same rate as the immediately previous frame in the frame exchange * sequence, if this rate belongs to the PHY mandatory rates, or else * at the highest possible rate belonging to the PHY rates in the * BSSBasicRateSet */ hdr = (struct ieee80211_hdr *)skb->data; if (ieee80211_is_ctl(hdr->frame_control)) { /* TODO: These control frames are not currently sent by * mac80211, but should they be implemented, this function * needs to be updated to support duration field calculation. * * RTS: time needed to transmit pending data/mgmt frame plus * one CTS frame plus one ACK frame plus 3 x SIFS * CTS: duration of immediately previous RTS minus time * required to transmit CTS and its SIFS * ACK: 0 if immediately previous directed data/mgmt had * more=0, with more=1 duration in ACK frame is duration * from previous frame minus time needed to transmit ACK * and its SIFS * PS Poll: BIT(15) | BIT(14) | aid */ return 0; } /* data/mgmt */ if (0 /* FIX: data/mgmt during CFP */) return cpu_to_le16(32768); if (group_addr) /* Group address as the destination - no ACK */ return 0; /* Individual destination address: * IEEE 802.11, Ch. 9.6 (after IEEE 802.11g changes) * CTS and ACK frames shall be transmitted using the highest rate in * basic rate set that is less than or equal to the rate of the * immediately previous frame and that is using the same modulation * (CCK or OFDM). If no basic rate set matches with these requirements, * the highest mandatory rate of the PHY that is less than or equal to * the rate of the previous frame is used. * Mandatory rates for IEEE 802.11g PHY: 1, 2, 5.5, 11, 6, 12, 24 Mbps */ rate = -1; /* use lowest available if everything fails */ mrate = sband->bitrates[0].bitrate; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *r = &sband->bitrates[i]; u32 flag; if (r->bitrate > txrate->bitrate) break; if (tx->sdata->vif.bss_conf.basic_rates & BIT(i)) rate = r->bitrate; switch (sband->band) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: if (tx->sdata->deflink.operating_11g_mode) flag = IEEE80211_RATE_MANDATORY_G; else flag = IEEE80211_RATE_MANDATORY_B; break; case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: flag = IEEE80211_RATE_MANDATORY_A; break; default: flag = 0; WARN_ON(1); break; } if (r->flags & flag) mrate = r->bitrate; } if (rate == -1) { /* No matching basic rate found; use highest suitable mandatory * PHY rate */ rate = mrate; } /* Don't calculate ACKs for QoS Frames with NoAck Policy set */ if (ieee80211_is_data_qos(hdr->frame_control) && *(ieee80211_get_qos_ctl(hdr)) & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) dur = 0; else /* Time needed to transmit ACK * (10 bytes + 4-byte FCS = 112 bits) plus SIFS; rounded up * to closest integer */ dur = ieee80211_frame_duration(sband->band, 10, rate, erp, tx->sdata->vif.bss_conf.use_short_preamble); if (next_frag_len) { /* Frame is fragmented: duration increases with time needed to * transmit next fragment plus ACK and 2 x SIFS. */ dur *= 2; /* ACK + SIFS */ /* next fragment */ dur += ieee80211_frame_duration(sband->band, next_frag_len, txrate->bitrate, erp, tx->sdata->vif.bss_conf.use_short_preamble); } return cpu_to_le16(dur); } /* tx handlers */ static ieee80211_tx_result debug_noinline ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) { struct ieee80211_local *local = tx->local; struct ieee80211_if_managed *ifmgd; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); /* driver doesn't support power save */ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return TX_CONTINUE; /* hardware does dynamic power save */ if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) return TX_CONTINUE; /* dynamic power save disabled */ if (local->hw.conf.dynamic_ps_timeout <= 0) return TX_CONTINUE; /* we are scanning, don't enable power save */ if (local->scanning) return TX_CONTINUE; if (!local->ps_sdata) return TX_CONTINUE; /* No point if we're going to suspend */ if (local->quiescing) return TX_CONTINUE; /* dynamic ps is supported only in managed mode */ if (tx->sdata->vif.type != NL80211_IFTYPE_STATION) return TX_CONTINUE; if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) return TX_CONTINUE; ifmgd = &tx->sdata->u.mgd; /* * Don't wakeup from power save if u-apsd is enabled, voip ac has * u-apsd enabled and the frame is in voip class. This effectively * means that even if all access categories have u-apsd enabled, in * practise u-apsd is only used with the voip ac. This is a * workaround for the case when received voip class packets do not * have correct qos tag for some reason, due the network or the * peer application. * * Note: ifmgd->uapsd_queues access is racy here. If the value is * changed via debugfs, user needs to reassociate manually to have * everything in sync. */ if ((ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) && (ifmgd->uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) && skb_get_queue_mapping(tx->skb) == IEEE80211_AC_VO) return TX_CONTINUE; if (local->hw.conf.flags & IEEE80211_CONF_PS) { ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_PS, false); ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; wiphy_work_queue(local->hw.wiphy, &local->dynamic_ps_disable_work); } /* Don't restart the timer if we're not disassociated */ if (!ifmgd->associated) return TX_CONTINUE; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); bool assoc = false; if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED)) return TX_CONTINUE; if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) && !ieee80211_is_probe_req(hdr->frame_control) && !ieee80211_is_any_nullfunc(hdr->frame_control)) /* * When software scanning only nullfunc frames (to notify * the sleep state to the AP) and probe requests (for the * active scan) are allowed, all other frames should not be * sent and we should not get here, but if we do * nonetheless, drop them to avoid sending them * off-channel. See the link below and * ieee80211_start_scan() for more. * * http://article.gmane.org/gmane.linux.kernel.wireless.general/30089 */ return TX_DROP; if (tx->sdata->vif.type == NL80211_IFTYPE_OCB) return TX_CONTINUE; if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; if (tx->sta) assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC); if (likely(tx->flags & IEEE80211_TX_UNICAST)) { if (unlikely(!assoc && ieee80211_is_data(hdr->frame_control))) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG sdata_info(tx->sdata, "dropped data frame to not associated station %pM\n", hdr->addr1); #endif I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc); return TX_DROP; } } else if (unlikely(ieee80211_is_data(hdr->frame_control) && ieee80211_vif_get_num_mcast_if(tx->sdata) == 0)) { /* * No associated STAs - no need to send multicast * frames. */ return TX_DROP; } return TX_CONTINUE; } /* This function is called whenever the AP is about to exceed the maximum limit * of buffered frames for power saving STAs. This situation should not really * happen often during normal operation, so dropping the oldest buffered packet * from each queue should be OK to make some room for new frames. */ static void purge_old_ps_buffers(struct ieee80211_local *local) { int total = 0, purged = 0; struct sk_buff *skb; struct ieee80211_sub_if_data *sdata; struct sta_info *sta; list_for_each_entry_rcu(sdata, &local->interfaces, list) { struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->u.ap.ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else continue; skb = skb_dequeue(&ps->bc_buf); if (skb) { purged++; ieee80211_free_txskb(&local->hw, skb); } total += skb_queue_len(&ps->bc_buf); } /* * Drop one frame from each station from the lowest-priority * AC that has frames at all. */ list_for_each_entry_rcu(sta, &local->sta_list, list) { int ac; for (ac = IEEE80211_AC_BK; ac >= IEEE80211_AC_VO; ac--) { skb = skb_dequeue(&sta->ps_tx_buf[ac]); total += skb_queue_len(&sta->ps_tx_buf[ac]); if (skb) { purged++; ieee80211_free_txskb(&local->hw, skb); break; } } } local->total_ps_buffered = total; ps_dbg_hw(&local->hw, "PS buffers full - purged %d frames\n", purged); } static ieee80211_tx_result ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ps_data *ps; /* * broadcast/multicast frame * * If any of the associated/peer stations is in power save mode, * the frame is buffered to be sent after DTIM beacon frame. * This is done either by the hardware or us. */ /* powersaving STAs currently only in AP/VLAN/mesh mode */ if (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!tx->sdata->bss) return TX_CONTINUE; ps = &tx->sdata->bss->ps; } else if (ieee80211_vif_is_mesh(&tx->sdata->vif)) { ps = &tx->sdata->u.mesh.ps; } else { return TX_CONTINUE; } /* no buffering for ordered frames */ if (ieee80211_has_order(hdr->frame_control)) return TX_CONTINUE; if (ieee80211_is_probe_req(hdr->frame_control)) return TX_CONTINUE; if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; /* no stations in PS mode and no buffered packets */ if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf)) return TX_CONTINUE; info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; /* device releases frame after DTIM beacon */ if (!ieee80211_hw_check(&tx->local->hw, HOST_BROADCAST_PS_BUFFERING)) return TX_CONTINUE; /* buffered in mac80211 */ if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) purge_old_ps_buffers(tx->local); if (skb_queue_len(&ps->bc_buf) >= AP_MAX_BC_BUFFER) { ps_dbg(tx->sdata, "BC TX buffer full - dropping the oldest frame\n"); ieee80211_free_txskb(&tx->local->hw, skb_dequeue(&ps->bc_buf)); } else tx->local->total_ps_buffered++; skb_queue_tail(&ps->bc_buf, tx->skb); return TX_QUEUED; } static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta, struct sk_buff *skb) { if (!ieee80211_is_mgmt(fc)) return 0; if (sta == NULL || !test_sta_flag(sta, WLAN_STA_MFP)) return 0; if (!ieee80211_is_robust_mgmt_frame(skb)) return 0; return 1; } static ieee80211_tx_result ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) { struct sta_info *sta = tx->sta; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ieee80211_local *local = tx->local; if (unlikely(!sta)) return TX_CONTINUE; if (unlikely((test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) && !(info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER))) { int ac = skb_get_queue_mapping(tx->skb); if (ieee80211_is_mgmt(hdr->frame_control) && !ieee80211_is_bufferable_mmpdu(tx->skb)) { info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; return TX_CONTINUE; } ps_dbg(sta->sdata, "STA %pM aid %d: PS buffer for AC %d\n", sta->sta.addr, sta->sta.aid, ac); if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) purge_old_ps_buffers(tx->local); /* sync with ieee80211_sta_ps_deliver_wakeup */ spin_lock(&sta->ps_lock); /* * STA woke up the meantime and all the frames on ps_tx_buf have * been queued to pending queue. No reordering can happen, go * ahead and Tx the packet. */ if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !test_sta_flag(sta, WLAN_STA_PS_DRIVER) && !test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { spin_unlock(&sta->ps_lock); return TX_CONTINUE; } if (skb_queue_len(&sta->ps_tx_buf[ac]) >= STA_MAX_TX_BUFFER) { struct sk_buff *old = skb_dequeue(&sta->ps_tx_buf[ac]); ps_dbg(tx->sdata, "STA %pM TX buffer for AC %d full - dropping oldest frame\n", sta->sta.addr, ac); ieee80211_free_txskb(&local->hw, old); } else tx->local->total_ps_buffered++; info->control.jiffies = jiffies; info->control.vif = &tx->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb); spin_unlock(&sta->ps_lock); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); /* * We queued up some frames, so the TIM bit might * need to be set, recalculate it. */ sta_info_recalc_tim(sta); return TX_QUEUED; } else if (unlikely(test_sta_flag(sta, WLAN_STA_PS_STA))) { ps_dbg(tx->sdata, "STA %pM in PS mode, but polling/in SP -> send frame\n", sta->sta.addr); } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx) { if (unlikely(tx->flags & IEEE80211_TX_PS_BUFFERED)) return TX_CONTINUE; if (tx->flags & IEEE80211_TX_UNICAST) return ieee80211_tx_h_unicast_ps_buf(tx); else return ieee80211_tx_h_multicast_ps_buf(tx); } static ieee80211_tx_result debug_noinline ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol)) { if (tx->sdata->control_port_no_encrypt) info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; info->flags |= IEEE80211_TX_CTL_USE_MINRATE; } return TX_CONTINUE; } static struct ieee80211_key * ieee80211_select_link_key(struct ieee80211_tx_data *tx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_link_data *link; unsigned int link_id; link_id = u32_get_bits(info->control.flags, IEEE80211_TX_CTRL_MLO_LINK); if (link_id == IEEE80211_LINK_UNSPECIFIED) { link = &tx->sdata->deflink; } else { link = rcu_dereference(tx->sdata->link[link_id]); if (!link) return NULL; } if (ieee80211_is_group_privacy_action(tx->skb)) return rcu_dereference(link->default_multicast_key); else if (ieee80211_is_mgmt(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && ieee80211_is_robust_mgmt_frame(tx->skb)) return rcu_dereference(link->default_mgmt_key); else if (is_multicast_ether_addr(hdr->addr1)) return rcu_dereference(link->default_multicast_key); return NULL; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) { struct ieee80211_key *key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) { tx->key = NULL; return TX_CONTINUE; } if (tx->sta && (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) tx->key = key; else if ((key = ieee80211_select_link_key(tx))) tx->key = key; else if (!is_multicast_ether_addr(hdr->addr1) && (key = rcu_dereference(tx->sdata->default_unicast_key))) tx->key = key; else tx->key = NULL; if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; return TX_CONTINUE; } if (tx->key) { bool skip_hw = false; /* TODO: add threshold stuff again */ switch (tx->key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: case WLAN_CIPHER_SUITE_TKIP: if (!ieee80211_is_data_present(hdr->frame_control)) tx->key = NULL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, tx->skb) && !ieee80211_is_group_privacy_action(tx->skb)) tx->key = NULL; else skip_hw = (tx->key->conf.flags & IEEE80211_KEY_FLAG_SW_MGMT_TX) && ieee80211_is_mgmt(hdr->frame_control); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (!ieee80211_is_mgmt(hdr->frame_control)) tx->key = NULL; break; } if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED && !ieee80211_is_deauth(hdr->frame_control)) && tx->skb->protocol != tx->sdata->control_port_protocol) return TX_DROP; if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { return TX_DROP; } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (void *)tx->skb->data; struct ieee80211_supported_band *sband; u32 len; struct ieee80211_tx_rate_control txrc; struct ieee80211_sta_rates *ratetbl = NULL; bool encap = info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP; bool assoc = false; memset(&txrc, 0, sizeof(txrc)); if (info->band < NUM_NL80211_BANDS) sband = tx->local->hw.wiphy->bands[info->band]; else return TX_CONTINUE; len = min_t(u32, tx->skb->len + FCS_LEN, tx->local->hw.wiphy->frag_threshold); /* set up the tx rate control struct we give the RC algo */ txrc.hw = &tx->local->hw; txrc.sband = sband; txrc.bss_conf = &tx->sdata->vif.bss_conf; txrc.skb = tx->skb; txrc.reported_rate.idx = -1; if (unlikely(info->control.flags & IEEE80211_TX_CTRL_DONT_USE_RATE_MASK)) { txrc.rate_idx_mask = ~0; } else { txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band]; if (tx->sdata->rc_has_mcs_mask[info->band]) txrc.rate_idx_mcs_mask = tx->sdata->rc_rateidx_mcs_mask[info->band]; } txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT || tx->sdata->vif.type == NL80211_IFTYPE_ADHOC || tx->sdata->vif.type == NL80211_IFTYPE_OCB); /* set up RTS protection if desired */ if (len > tx->local->hw.wiphy->rts_threshold) { txrc.rts = true; } info->control.use_rts = txrc.rts; info->control.use_cts_prot = tx->sdata->vif.bss_conf.use_cts_prot; /* * Use short preamble if the BSS can handle it, but not for * management frames unless we know the receiver can handle * that -- the management frame might be to a station that * just wants a probe response. */ if (tx->sdata->vif.bss_conf.use_short_preamble && (ieee80211_is_tx_data(tx->skb) || (tx->sta && test_sta_flag(tx->sta, WLAN_STA_SHORT_PREAMBLE)))) txrc.short_preamble = true; info->control.short_preamble = txrc.short_preamble; /* don't ask rate control when rate already injected via radiotap */ if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT) return TX_CONTINUE; if (tx->sta) assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC); /* * Lets not bother rate control if we're associated and cannot * talk to the sta. This should not happen. */ if (WARN(test_bit(SCAN_SW_SCANNING, &tx->local->scanning) && assoc && !rate_usable_index_exists(sband, &tx->sta->sta), "%s: Dropped data frame as no usable bitrate found while " "scanning and associated. Target station: " "%pM on %d GHz band\n", tx->sdata->name, encap ? ((struct ethhdr *)hdr)->h_dest : hdr->addr1, info->band ? 5 : 2)) return TX_DROP; /* * If we're associated with the sta at this point we know we can at * least send the frame at the lowest bit rate. */ rate_control_get_rate(tx->sdata, tx->sta, &txrc); if (tx->sta && !info->control.skip_table) ratetbl = rcu_dereference(tx->sta->sta.rates); if (unlikely(info->control.rates[0].idx < 0)) { if (ratetbl) { struct ieee80211_tx_rate rate = { .idx = ratetbl->rate[0].idx, .flags = ratetbl->rate[0].flags, .count = ratetbl->rate[0].count }; if (ratetbl->rate[0].idx < 0) return TX_DROP; tx->rate = rate; } else { return TX_DROP; } } else { tx->rate = info->control.rates[0]; } if (txrc.reported_rate.idx < 0) { txrc.reported_rate = tx->rate; if (tx->sta && ieee80211_is_tx_data(tx->skb)) tx->sta->deflink.tx_stats.last_rate = txrc.reported_rate; } else if (tx->sta) tx->sta->deflink.tx_stats.last_rate = txrc.reported_rate; if (ratetbl) return TX_CONTINUE; if (unlikely(!info->control.rates[0].count)) info->control.rates[0].count = 1; if (WARN_ON_ONCE((info->control.rates[0].count > 1) && (info->flags & IEEE80211_TX_CTL_NO_ACK))) info->control.rates[0].count = 1; return TX_CONTINUE; } static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) { u16 *seq = &sta->tid_seq[tid]; __le16 ret = cpu_to_le16(*seq); /* Increase the sequence number. */ *seq = (*seq + 0x10) & IEEE80211_SCTL_SEQ; return ret; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; int tid; /* * Packet injection may want to control the sequence * number, if we have no matching interface then we * neither assign one ourselves nor ask the driver to. */ if (unlikely(info->control.vif->type == NL80211_IFTYPE_MONITOR)) return TX_CONTINUE; if (unlikely(ieee80211_is_ctl(hdr->frame_control))) return TX_CONTINUE; if (ieee80211_hdrlen(hdr->frame_control) < 24) return TX_CONTINUE; if (ieee80211_is_qos_nullfunc(hdr->frame_control)) return TX_CONTINUE; if (info->control.flags & IEEE80211_TX_CTRL_NO_SEQNO) return TX_CONTINUE; /* SNS11 from 802.11be 10.3.2.14 */ if (unlikely(is_multicast_ether_addr(hdr->addr1) && ieee80211_vif_is_mld(info->control.vif) && info->control.vif->type == NL80211_IFTYPE_AP)) { if (info->control.flags & IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX) tx->sdata->mld_mcast_seq += 0x10; hdr->seq_ctrl = cpu_to_le16(tx->sdata->mld_mcast_seq); return TX_CONTINUE; } /* * Anything but QoS data that has a sequence number field * (is long enough) gets a sequence number from the global * counter. QoS data frames with a multicast destination * also use the global counter (802.11-2012 9.3.2.10). */ if (!ieee80211_is_data_qos(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) { /* driver should assign sequence number */ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; /* for pure STA mode without beacons, we can do it */ hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number); tx->sdata->sequence_number += 0x10; if (tx->sta) tx->sta->deflink.tx_stats.msdu[IEEE80211_NUM_TIDS]++; return TX_CONTINUE; } /* * This should be true for injected/management frames only, for * management frames we have set the IEEE80211_TX_CTL_ASSIGN_SEQ * above since they are not QoS-data frames. */ if (!tx->sta) return TX_CONTINUE; /* include per-STA, per-TID sequence counter */ tid = ieee80211_get_tid(hdr); tx->sta->deflink.tx_stats.msdu[tid]++; hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; } static int ieee80211_fragment(struct ieee80211_tx_data *tx, struct sk_buff *skb, int hdrlen, int frag_threshold) { struct ieee80211_local *local = tx->local; struct ieee80211_tx_info *info; struct sk_buff *tmp; int per_fragm = frag_threshold - hdrlen - FCS_LEN; int pos = hdrlen + per_fragm; int rem = skb->len - hdrlen - per_fragm; if (WARN_ON(rem < 0)) return -EINVAL; /* first fragment was already added to queue by caller */ while (rem) { int fraglen = per_fragm; if (fraglen > rem) fraglen = rem; rem -= fraglen; tmp = dev_alloc_skb(local->tx_headroom + frag_threshold + IEEE80211_ENCRYPT_HEADROOM + IEEE80211_ENCRYPT_TAILROOM); if (!tmp) return -ENOMEM; __skb_queue_tail(&tx->skbs, tmp); skb_reserve(tmp, local->tx_headroom + IEEE80211_ENCRYPT_HEADROOM); /* copy control information */ memcpy(tmp->cb, skb->cb, sizeof(tmp->cb)); info = IEEE80211_SKB_CB(tmp); info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT); if (rem) info->flags |= IEEE80211_TX_CTL_MORE_FRAMES; skb_copy_queue_mapping(tmp, skb); tmp->priority = skb->priority; tmp->dev = skb->dev; /* copy header and data */ skb_put_data(tmp, skb->data, hdrlen); skb_put_data(tmp, skb->data + pos, fraglen); pos += fraglen; } /* adjust first fragment's length */ skb_trim(skb, hdrlen + per_fragm); return 0; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx) { struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int frag_threshold = tx->local->hw.wiphy->frag_threshold; int hdrlen; int fragnum; /* no matter what happens, tx->skb moves to tx->skbs */ __skb_queue_tail(&tx->skbs, skb); tx->skb = NULL; if (info->flags & IEEE80211_TX_CTL_DONTFRAG) return TX_CONTINUE; if (ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) return TX_CONTINUE; /* * Warn when submitting a fragmented A-MPDU frame and drop it. * This scenario is handled in ieee80211_tx_prepare but extra * caution taken here as fragmented ampdu may cause Tx stop. */ if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU)) return TX_DROP; hdrlen = ieee80211_hdrlen(hdr->frame_control); /* internal error, why isn't DONTFRAG set? */ if (WARN_ON(skb->len + FCS_LEN <= frag_threshold)) return TX_DROP; /* * Now fragment the frame. This will allocate all the fragments and * chain them (using skb as the first fragment) to skb->next. * During transmission, we will remove the successfully transmitted * fragments from this list. When the low-level driver rejects one * of the fragments then we will simply pretend to accept the skb * but store it away as pending. */ if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold)) return TX_DROP; /* update duration/seq/flags of fragments */ fragnum = 0; skb_queue_walk(&tx->skbs, skb) { const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS); hdr = (void *)skb->data; info = IEEE80211_SKB_CB(skb); if (!skb_queue_is_last(&tx->skbs, skb)) { hdr->frame_control |= morefrags; /* * No multi-rate retries for fragmented frames, that * would completely throw off the NAV at other STAs. */ info->control.rates[1].idx = -1; info->control.rates[2].idx = -1; info->control.rates[3].idx = -1; BUILD_BUG_ON(IEEE80211_TX_MAX_RATES != 4); info->flags &= ~IEEE80211_TX_CTL_RATE_CTRL_PROBE; } else { hdr->frame_control &= ~morefrags; } hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG); fragnum++; } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) { struct sk_buff *skb; int ac = -1; if (!tx->sta) return TX_CONTINUE; skb_queue_walk(&tx->skbs, skb) { ac = skb_get_queue_mapping(skb); tx->sta->deflink.tx_stats.bytes[ac] += skb->len; } if (ac >= 0) tx->sta->deflink.tx_stats.packets[ac]++; return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) { if (!tx->key) return TX_CONTINUE; switch (tx->key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: return ieee80211_crypto_wep_encrypt(tx); case WLAN_CIPHER_SUITE_TKIP: return ieee80211_crypto_tkip_encrypt(tx); case WLAN_CIPHER_SUITE_CCMP: return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_MIC_LEN); case WLAN_CIPHER_SUITE_CCMP_256: return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_256_MIC_LEN); case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); case WLAN_CIPHER_SUITE_BIP_CMAC_256: return ieee80211_crypto_aes_cmac_256_encrypt(tx); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return ieee80211_crypto_aes_gmac_encrypt(tx); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: return ieee80211_crypto_gcmp_encrypt(tx); } return TX_DROP; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; int next_len; bool group_addr; skb_queue_walk(&tx->skbs, skb) { hdr = (void *) skb->data; if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) break; /* must not overwrite AID */ if (!skb_queue_is_last(&tx->skbs, skb)) { struct sk_buff *next = skb_queue_next(&tx->skbs, skb); next_len = next->len; } else next_len = 0; group_addr = is_multicast_ether_addr(hdr->addr1); hdr->duration_id = ieee80211_duration(tx, skb, group_addr, next_len); } return TX_CONTINUE; } /* actual transmit path */ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, struct sk_buff *skb, struct ieee80211_tx_info *info, struct tid_ampdu_tx *tid_tx, int tid) { bool queued = false; bool reset_agg_timer = false; struct sk_buff *purge_skb = NULL; if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { reset_agg_timer = true; } else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) { /* * nothing -- this aggregation session is being started * but that might still fail with the driver */ } else if (!tx->sta->sta.txq[tid]) { spin_lock(&tx->sta->lock); /* * Need to re-check now, because we may get here * * 1) in the window during which the setup is actually * already done, but not marked yet because not all * packets are spliced over to the driver pending * queue yet -- if this happened we acquire the lock * either before or after the splice happens, but * need to recheck which of these cases happened. * * 2) during session teardown, if the OPERATIONAL bit * was cleared due to the teardown but the pointer * hasn't been assigned NULL yet (or we loaded it * before it was assigned) -- in this case it may * now be NULL which means we should just let the * packet pass through because splicing the frames * back is already done. */ tid_tx = rcu_dereference_protected_tid_tx(tx->sta, tid); if (!tid_tx) { /* do nothing, let packet pass through */ } else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { reset_agg_timer = true; } else { queued = true; if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { clear_sta_flag(tx->sta, WLAN_STA_SP); ps_dbg(tx->sta->sdata, "STA %pM aid %d: SP frame queued, close the SP w/o telling the peer\n", tx->sta->sta.addr, tx->sta->sta.aid); } info->control.vif = &tx->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) purge_skb = __skb_dequeue(&tid_tx->pending); } spin_unlock(&tx->sta->lock); if (purge_skb) ieee80211_free_txskb(&tx->local->hw, purge_skb); } /* reset session timer */ if (reset_agg_timer) tid_tx->last_tx = jiffies; return queued; } void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct rate_control_ref *ref = sdata->local->rate_ctrl; u16 tid; if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) return; if (!sta || (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported && !sta->sta.deflink.s1g_cap.s1g) || !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || skb->protocol == sdata->control_port_protocol) return; tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; if (likely(sta->ampdu_mlme.tid_tx[tid])) return; ieee80211_start_tx_ba_session(&sta->sta, tid, 0); } /* * initialises @tx * pass %NULL for the station if unknown, a valid pointer if known * or an ERR_PTR() if the station is known not to exist */ static ieee80211_tx_result ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_data *tx, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); bool aggr_check = false; int tid; memset(tx, 0, sizeof(*tx)); tx->skb = skb; tx->local = local; tx->sdata = sdata; __skb_queue_head_init(&tx->skbs); /* * If this flag is set to true anywhere, and we get here, * we are doing the needed processing, so remove the flag * now. */ info->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; hdr = (struct ieee80211_hdr *) skb->data; if (likely(sta)) { if (!IS_ERR(sta)) tx->sta = sta; } else { if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { tx->sta = rcu_dereference(sdata->u.vlan.sta); if (!tx->sta && sdata->wdev.use_4addr) return TX_DROP; } else if (tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) { tx->sta = sta_info_get(sdata, hdr->addr1); aggr_check = true; } } if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && !ieee80211_is_qos_nullfunc(hdr->frame_control) && ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) && !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; tid = ieee80211_get_tid(hdr); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx && aggr_check) { ieee80211_aggr_check(sdata, tx->sta, skb); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); } if (tid_tx) { bool queued; queued = ieee80211_tx_prep_agg(tx, skb, info, tid_tx, tid); if (unlikely(queued)) return TX_QUEUED; } } if (is_multicast_ether_addr(hdr->addr1)) { tx->flags &= ~IEEE80211_TX_UNICAST; info->flags |= IEEE80211_TX_CTL_NO_ACK; } else tx->flags |= IEEE80211_TX_UNICAST; if (!(info->flags & IEEE80211_TX_CTL_DONTFRAG)) { if (!(tx->flags & IEEE80211_TX_UNICAST) || skb->len + FCS_LEN <= local->hw.wiphy->frag_threshold || info->flags & IEEE80211_TX_CTL_AMPDU) info->flags |= IEEE80211_TX_CTL_DONTFRAG; } if (!tx->sta) info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) { info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; ieee80211_check_fast_xmit(tx->sta); } info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT; return TX_CONTINUE; } static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, struct ieee80211_vif *vif, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_txq *txq = NULL; if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) return NULL; if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) && unlikely(!ieee80211_is_data_present(hdr->frame_control))) { if ((!ieee80211_is_mgmt(hdr->frame_control) || ieee80211_is_bufferable_mmpdu(skb) || vif->type == NL80211_IFTYPE_STATION) && sta && sta->uploaded) { /* * This will be NULL if the driver didn't set the * opt-in hardware flag. */ txq = sta->sta.txq[IEEE80211_NUM_TIDS]; } } else if (sta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; if (!sta->uploaded) return NULL; txq = sta->sta.txq[tid]; } else { txq = vif->txq; } if (!txq) return NULL; return to_txq_info(txq); } static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { struct sk_buff *next; codel_time_t now = codel_get_time(); skb_list_walk_safe(skb, skb, next) IEEE80211_SKB_CB(skb)->control.enqueue_time = now; } static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; } static codel_time_t codel_skb_time_func(const struct sk_buff *skb) { const struct ieee80211_tx_info *info; info = (const struct ieee80211_tx_info *)skb->cb; return info->control.enqueue_time; } static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, void *ctx) { struct ieee80211_local *local; struct txq_info *txqi; struct fq *fq; struct fq_flow *flow; txqi = ctx; local = vif_to_sdata(txqi->txq.vif)->local; fq = &local->fq; if (cvars == &txqi->def_cvars) flow = &txqi->tin.default_flow; else flow = &fq->flows[cvars - local->cvars]; return fq_flow_dequeue(fq, flow); } static void codel_drop_func(struct sk_buff *skb, void *ctx) { struct ieee80211_local *local; struct ieee80211_hw *hw; struct txq_info *txqi; txqi = ctx; local = vif_to_sdata(txqi->txq.vif)->local; hw = &local->hw; ieee80211_free_txskb(hw, skb); } static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow) { struct ieee80211_local *local; struct txq_info *txqi; struct codel_vars *cvars; struct codel_params *cparams; struct codel_stats *cstats; local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); cparams = &local->cparams; cstats = &txqi->cstats; if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else cvars = &local->cvars[flow - fq->flows]; return codel_dequeue(txqi, &flow->backlog, cparams, cvars, cstats, codel_skb_len_func, codel_skb_time_func, codel_drop_func, codel_dequeue_func); } static void fq_skb_free_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow, struct sk_buff *skb) { struct ieee80211_local *local; local = container_of(fq, struct ieee80211_local, fq); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct txq_info *txqi, struct sk_buff *skb) { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; u32 flow_idx; ieee80211_set_skb_enqueue_time(skb); spin_lock_bh(&fq->lock); /* * For management frames, don't really apply codel etc., * we don't want to apply any shaping or anything we just * want to simplify the driver API by having them on the * txqi. */ if (unlikely(txqi->txq.tid == IEEE80211_NUM_TIDS)) { IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; __skb_queue_tail(&txqi->frags, skb); } else { flow_idx = fq_flow_idx(fq, skb); fq_tin_enqueue(fq, tin, flow_idx, skb, fq_skb_free_func); } spin_unlock_bh(&fq->lock); } static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow, struct sk_buff *skb, void *data) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); return info->control.vif == data; } void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct fq *fq = &local->fq; struct txq_info *txqi; struct fq_tin *tin; struct ieee80211_sub_if_data *ap; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) return; ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (!ap->vif.txq) return; txqi = to_txq_info(ap->vif.txq); tin = &txqi->tin; spin_lock_bh(&fq->lock); fq_tin_filter(fq, tin, fq_vlan_filter_func, &sdata->vif, fq_skb_free_func); spin_unlock_bh(&fq->lock); } void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txqi, int tid) { fq_tin_init(&txqi->tin); codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); __skb_queue_head_init(&txqi->frags); INIT_LIST_HEAD(&txqi->schedule_order); txqi->txq.vif = &sdata->vif; if (!sta) { sdata->vif.txq = &txqi->txq; txqi->txq.tid = 0; txqi->txq.ac = IEEE80211_AC_BE; return; } if (tid == IEEE80211_NUM_TIDS) { if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* Drivers need to opt in to the management MPDU TXQ */ if (!ieee80211_hw_check(&sdata->local->hw, STA_MMPDU_TXQ)) return; } else if (!ieee80211_hw_check(&sdata->local->hw, BUFF_MMPDU_TXQ)) { /* Drivers need to opt in to the bufferable MMPDU TXQ */ return; } txqi->txq.ac = IEEE80211_AC_VO; } else { txqi->txq.ac = ieee80211_ac_from_tid(tid); } txqi->txq.sta = &sta->sta; txqi->txq.tid = tid; sta->sta.txq[tid] = &txqi->txq; } void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi) { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; spin_lock_bh(&fq->lock); fq_tin_reset(fq, tin, fq_skb_free_func); ieee80211_purge_tx_queue(&local->hw, &txqi->frags); spin_unlock_bh(&fq->lock); spin_lock_bh(&local->active_txq_lock[txqi->txq.ac]); list_del_init(&txqi->schedule_order); spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]); } void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx) { if (local->hw.wiphy->txq_limit) local->fq.limit = local->hw.wiphy->txq_limit; else local->hw.wiphy->txq_limit = local->fq.limit; if (local->hw.wiphy->txq_memory_limit) local->fq.memory_limit = local->hw.wiphy->txq_memory_limit; else local->hw.wiphy->txq_memory_limit = local->fq.memory_limit; if (local->hw.wiphy->txq_quantum) local->fq.quantum = local->hw.wiphy->txq_quantum; else local->hw.wiphy->txq_quantum = local->fq.quantum; } int ieee80211_txq_setup_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; int ret; int i; bool supp_vht = false; enum nl80211_band band; ret = fq_init(fq, 4096); if (ret) return ret; /* * If the hardware doesn't support VHT, it is safe to limit the maximum * queue size. 4 Mbytes is 64 max-size aggregates in 802.11n. */ for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[band]; if (!sband) continue; supp_vht = supp_vht || sband->vht_cap.vht_supported; } if (!supp_vht) fq->memory_limit = 4 << 20; /* 4 Mbytes */ codel_params_init(&local->cparams); local->cparams.interval = MS2TIME(100); local->cparams.target = MS2TIME(20); local->cparams.ecn = true; local->cvars = kvcalloc(fq->flows_cnt, sizeof(local->cvars[0]), GFP_KERNEL); if (!local->cvars) { spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); spin_unlock_bh(&fq->lock); return -ENOMEM; } for (i = 0; i < fq->flows_cnt; i++) codel_vars_init(&local->cvars[i]); ieee80211_txq_set_params(local, -1); return 0; } void ieee80211_txq_teardown_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; kvfree(local->cvars); local->cvars = NULL; spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); spin_unlock_bh(&fq->lock); } static bool ieee80211_queue_skb(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_vif *vif; struct txq_info *txqi; if (sdata->vif.type == NL80211_IFTYPE_MONITOR) return false; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); vif = &sdata->vif; txqi = ieee80211_get_txq(local, vif, sta, skb); if (!txqi) return false; ieee80211_txq_enqueue(local, txqi, skb); schedule_and_wake_txq(local, txqi); return true; } static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, struct sta_info *sta, struct sk_buff_head *skbs, bool txpending) { struct ieee80211_tx_control control = {}; struct sk_buff *skb, *tmp; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int q = info->hw_queue; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (WARN_ON_ONCE(q >= local->hw.queues)) { __skb_unlink(skb, skbs); ieee80211_free_txskb(&local->hw, skb); continue; } #endif spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) { if (local->queue_stop_reasons[q] & ~BIT(IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL)) { /* * Drop off-channel frames if queues * are stopped for any reason other * than off-channel operation. Never * queue them. */ spin_unlock_irqrestore( &local->queue_stop_reason_lock, flags); ieee80211_purge_tx_queue(&local->hw, skbs); return true; } } else { /* * Since queue is stopped, queue up frames for * later transmission from the tx-pending * tasklet when the queue is woken again. */ if (txpending) skb_queue_splice_init(skbs, &local->pending[q]); else skb_queue_splice_tail_init(skbs, &local->pending[q]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return false; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; control.sta = sta ? &sta->sta : NULL; __skb_unlink(skb, skbs); drv_tx(local, &control, skb); } return true; } /* * Returns false if the frame couldn't be transmitted but was queued instead. */ static bool __ieee80211_tx(struct ieee80211_local *local, struct sk_buff_head *skbs, struct sta_info *sta, bool txpending) { struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; struct sk_buff *skb; bool result; if (WARN_ON(skb_queue_empty(skbs))) return true; skb = skb_peek(skbs); info = IEEE80211_SKB_CB(skb); sdata = vif_to_sdata(info->control.vif); if (sta && !sta->uploaded) sta = NULL; switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { vif = &sdata->vif; break; } sdata = rcu_dereference(local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { ieee80211_purge_tx_queue(&local->hw, skbs); return true; } else vif = NULL; break; case NL80211_IFTYPE_AP_VLAN: sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); fallthrough; default: vif = &sdata->vif; break; } result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); WARN_ON_ONCE(!skb_queue_empty(skbs)); return result; } /* * Invoke TX handlers, return 0 on success and non-zero if the * frame was dropped or queued. * * The handlers are split into an early and late part. The latter is everything * that can be sensitive to reordering, and will be deferred to after packets * are dequeued from the intermediate queues (when they are enabled). */ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) { ieee80211_tx_result res = TX_DROP; #define CALL_TXH(txh) \ do { \ res = txh(tx); \ if (res != TX_CONTINUE) \ goto txh_done; \ } while (0) CALL_TXH(ieee80211_tx_h_dynamic_ps); CALL_TXH(ieee80211_tx_h_check_assoc); CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); txh_done: if (unlikely(res == TX_DROP)) { tx->sdata->tx_handlers_drop++; if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); return -1; } else if (unlikely(res == TX_QUEUED)) { I802_DEBUG_INC(tx->local->tx_handlers_queued); return -1; } return 0; } /* * Late handlers can be called while the sta lock is held. Handlers that can * cause packets to be generated will cause deadlock! */ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); ieee80211_tx_result res = TX_CONTINUE; if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { __skb_queue_tail(&tx->skbs, tx->skb); tx->skb = NULL; goto txh_done; } CALL_TXH(ieee80211_tx_h_michael_mic_add); CALL_TXH(ieee80211_tx_h_sequence); CALL_TXH(ieee80211_tx_h_fragment); /* handlers after fragment must be aware of tx info fragmentation! */ CALL_TXH(ieee80211_tx_h_stats); CALL_TXH(ieee80211_tx_h_encrypt); if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_calculate_duration); #undef CALL_TXH txh_done: if (unlikely(res == TX_DROP)) { tx->sdata->tx_handlers_drop++; if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); return -1; } else if (unlikely(res == TX_QUEUED)) { I802_DEBUG_INC(tx->local->tx_handlers_queued); return -1; } return 0; } static int invoke_tx_handlers(struct ieee80211_tx_data *tx) { int r = invoke_tx_handlers_early(tx); if (r) return r; return invoke_tx_handlers_late(tx); } bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_tx_data tx; struct sk_buff *skb2; if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP) return false; info->band = band; info->control.vif = vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers(&tx)) return false; if (sta) { if (tx.sta) *sta = &tx.sta->sta; else *sta = NULL; } /* this function isn't suitable for fragmented data frames */ skb2 = __skb_dequeue(&tx.skbs); if (WARN_ON(skb2 != skb || !skb_queue_empty(&tx.skbs))) { ieee80211_free_txskb(hw, skb2); ieee80211_purge_tx_queue(hw, &tx.skbs); return false; } return true; } EXPORT_SYMBOL(ieee80211_tx_prepare_skb); /* * Returns false if the frame couldn't be transmitted but was queued instead. */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb, bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; ieee80211_tx_result res_prepare; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); bool result = true; if (unlikely(skb->len < 10)) { dev_kfree_skb(skb); return true; } /* initialises tx */ res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); tx.sdata->tx_handlers_drop++; return true; } else if (unlikely(res_prepare == TX_QUEUED)) { return true; } /* set up hw_queue value early */ if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers_early(&tx)) return true; if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) return true; if (!invoke_tx_handlers_late(&tx)) result = __ieee80211_tx(local, &tx.skbs, tx.sta, txpending); return result; } /* device xmit handlers */ enum ieee80211_encrypt { ENCRYPT_NO, ENCRYPT_MGMT, ENCRYPT_DATA, }; static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int head_need, enum ieee80211_encrypt encrypt) { struct ieee80211_local *local = sdata->local; bool enc_tailroom; int tail_need = 0; enc_tailroom = encrypt == ENCRYPT_MGMT || (encrypt == ENCRYPT_DATA && sdata->crypto_tx_tailroom_needed_cnt); if (enc_tailroom) { tail_need = IEEE80211_ENCRYPT_TAILROOM; tail_need -= skb_tailroom(skb); tail_need = max_t(int, tail_need, 0); } if (skb_cloned(skb) && (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) || !skb_clone_writable(skb, ETH_HLEN) || enc_tailroom)) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); else return 0; if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) { wiphy_debug(local->hw.wiphy, "failed to reallocate TX buffer\n"); return -ENOMEM; } return 0; } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; int headroom; enum ieee80211_encrypt encrypt; if (info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT) encrypt = ENCRYPT_NO; else if (ieee80211_is_mgmt(hdr->frame_control)) encrypt = ENCRYPT_MGMT; else encrypt = ENCRYPT_DATA; headroom = local->tx_headroom; if (encrypt != ENCRYPT_NO) headroom += IEEE80211_ENCRYPT_HEADROOM; headroom -= skb_headroom(skb); headroom = max_t(int, 0, headroom); if (ieee80211_skb_resize(sdata, skb, headroom, encrypt)) { ieee80211_free_txskb(&local->hw, skb); return; } /* reload after potential resize */ hdr = (struct ieee80211_hdr *) skb->data; info->control.vif = &sdata->vif; if (ieee80211_vif_is_mesh(&sdata->vif)) { if (ieee80211_is_data(hdr->frame_control) && is_unicast_ether_addr(hdr->addr1)) { if (mesh_nexthop_resolve(sdata, skb)) return; /* skb queued: don't free */ } else { ieee80211_mps_set_frame_flags(sdata, NULL, hdr); } } ieee80211_set_qos_hdr(sdata, skb); ieee80211_tx(sdata, sta, skb, false); } static bool ieee80211_validate_radiotap_len(struct sk_buff *skb) { struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *)skb->data; /* check for not even having the fixed radiotap header part */ if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) return false; /* too short to be possibly valid */ /* is it a header version we can trust to find length from? */ if (unlikely(rthdr->it_version)) return false; /* only version 0 is supported */ /* does the skb contain enough to deliver on the alleged length? */ if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data))) return false; /* skb too short for claimed rt header extent */ return true; } bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_radiotap_iterator iterator; struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len, NULL); u16 txflags; u16 rate = 0; bool rate_found = false; u8 rate_retries = 0; u16 rate_flags = 0; u8 mcs_known, mcs_flags, mcs_bw; u16 vht_known; u8 vht_mcs = 0, vht_nss = 0; int i; if (!ieee80211_validate_radiotap_len(skb)) return false; info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; /* * for every radiotap entry that is present * (ieee80211_radiotap_iterator_next returns -ENOENT when no more * entries present, or -EINVAL on error) */ while (!ret) { ret = ieee80211_radiotap_iterator_next(&iterator); if (ret) continue; /* see if this argument is something we can use */ switch (iterator.this_arg_index) { /* * You must take care when dereferencing iterator.this_arg * for multibyte types... the pointer is not aligned. Use * get_unaligned((type *)iterator.this_arg) to dereference * iterator.this_arg for type "type" safely on all arches. */ case IEEE80211_RADIOTAP_FLAGS: if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FCS) { /* * this indicates that the skb we have been * handed has the 32-bit FCS CRC at the end... * we should react to that by snipping it off * because it will be recomputed and added * on transmission */ if (skb->len < (iterator._max_length + FCS_LEN)) return false; skb_trim(skb, skb->len - FCS_LEN); } if (*iterator.this_arg & IEEE80211_RADIOTAP_F_WEP) info->flags &= ~IEEE80211_TX_INTFL_DONT_ENCRYPT; if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FRAG) info->flags &= ~IEEE80211_TX_CTL_DONTFRAG; break; case IEEE80211_RADIOTAP_TX_FLAGS: txflags = get_unaligned_le16(iterator.this_arg); if (txflags & IEEE80211_RADIOTAP_F_TX_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO) info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; if (txflags & IEEE80211_RADIOTAP_F_TX_ORDER) info->control.flags |= IEEE80211_TX_CTRL_DONT_REORDER; break; case IEEE80211_RADIOTAP_RATE: rate = *iterator.this_arg; rate_flags = 0; rate_found = true; break; case IEEE80211_RADIOTAP_ANTENNA: /* this can appear multiple times, keep a bitmap */ info->control.antennas |= BIT(*iterator.this_arg); break; case IEEE80211_RADIOTAP_DATA_RETRIES: rate_retries = *iterator.this_arg; break; case IEEE80211_RADIOTAP_MCS: mcs_known = iterator.this_arg[0]; mcs_flags = iterator.this_arg[1]; if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS)) break; rate_found = true; rate = iterator.this_arg[2]; rate_flags = IEEE80211_TX_RC_MCS; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI && mcs_flags & IEEE80211_RADIOTAP_MCS_SGI) rate_flags |= IEEE80211_TX_RC_SHORT_GI; mcs_bw = mcs_flags & IEEE80211_RADIOTAP_MCS_BW_MASK; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW && mcs_bw == IEEE80211_RADIOTAP_MCS_BW_40) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_FEC && mcs_flags & IEEE80211_RADIOTAP_MCS_FEC_LDPC) info->flags |= IEEE80211_TX_CTL_LDPC; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_STBC) { u8 stbc = u8_get_bits(mcs_flags, IEEE80211_RADIOTAP_MCS_STBC_MASK); info->flags |= u32_encode_bits(stbc, IEEE80211_TX_CTL_STBC); } break; case IEEE80211_RADIOTAP_VHT: vht_known = get_unaligned_le16(iterator.this_arg); rate_found = true; rate_flags = IEEE80211_TX_RC_VHT_MCS; if ((vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_GI) && (iterator.this_arg[2] & IEEE80211_RADIOTAP_VHT_FLAG_SGI)) rate_flags |= IEEE80211_TX_RC_SHORT_GI; if (vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) { if (iterator.this_arg[3] == 1) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; else if (iterator.this_arg[3] == 4) rate_flags |= IEEE80211_TX_RC_80_MHZ_WIDTH; else if (iterator.this_arg[3] == 11) rate_flags |= IEEE80211_TX_RC_160_MHZ_WIDTH; } vht_mcs = iterator.this_arg[4] >> 4; if (vht_mcs > 11) vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; if (!vht_nss || vht_nss > 8) vht_nss = 1; break; /* * Please update the file * Documentation/networking/mac80211-injection.rst * when parsing new fields here. */ default: break; } } if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */ return false; if (rate_found) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[info->band]; info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } if (rate_flags & IEEE80211_TX_RC_MCS) { /* reset antennas if not enough */ if (IEEE80211_HT_MCS_CHAINS(rate) > hweight8(info->control.antennas)) info->control.antennas = 0; info->control.rates[0].idx = rate; } else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) { /* reset antennas if not enough */ if (vht_nss > hweight8(info->control.antennas)) info->control.antennas = 0; ieee80211_rate_set_vht(info->control.rates, vht_mcs, vht_nss); } else if (sband) { for (i = 0; i < sband->n_bitrates; i++) { if (rate * 5 != sband->bitrates[i].bitrate) continue; info->control.rates[0].idx = i; break; } } if (info->control.rates[0].idx < 0) info->control.flags &= ~IEEE80211_TX_CTRL_RATE_INJECT; info->control.rates[0].flags = rate_flags; info->control.rates[0].count = min_t(u8, rate_retries + 1, local->hw.max_rate_tries); } return true; } netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *tmp_sdata, *sdata; struct cfg80211_chan_def *chandef; u16 len_rthdr; int hdrlen; sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (unlikely(!ieee80211_sdata_running(sdata))) goto fail; memset(info, 0, sizeof(*info)); info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; /* Sanity-check the length of the radiotap header */ if (!ieee80211_validate_radiotap_len(skb)) goto fail; /* we now know there is a radiotap header with a length we can use */ len_rthdr = ieee80211_get_radiotap_len(skb->data); /* * fix up the pointers accounting for the radiotap * header still being in there. We are being given * a precooked IEEE80211 header so no need for * normal processing */ skb_set_mac_header(skb, len_rthdr); /* * these are just fixed to the end of the rt area since we * don't have any better information and at this point, nobody cares */ skb_set_network_header(skb, len_rthdr); skb_set_transport_header(skb, len_rthdr); if (skb->len < len_rthdr + 2) goto fail; hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr); hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < len_rthdr + hdrlen) goto fail; /* * Initialize skb->protocol if the injected frame is a data frame * carrying a rfc1042 header */ if (ieee80211_is_data(hdr->frame_control) && skb->len >= len_rthdr + hdrlen + sizeof(rfc1042_header) + 2) { u8 *payload = (u8 *)hdr + hdrlen; if (ether_addr_equal(payload, rfc1042_header)) skb->protocol = cpu_to_be16((payload[6] << 8) | payload[7]); } rcu_read_lock(); /* * We process outgoing injected frames that have a local address * we handle as though they are non-injected frames. * This code here isn't entirely correct, the local MAC address * isn't always enough to find the interface to use; for proper * VLAN support we have an nl80211-based mechanism. * * This is necessary, for example, for old hostapd versions that * don't use nl80211-based management TX/RX. */ list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(tmp_sdata)) continue; if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR || tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN) continue; if (ether_addr_equal(tmp_sdata->vif.addr, hdr->addr2)) { sdata = tmp_sdata; break; } } chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) { tmp_sdata = rcu_dereference(local->monitor_sdata); if (tmp_sdata) chanctx_conf = rcu_dereference(tmp_sdata->vif.bss_conf.chanctx_conf); } if (chanctx_conf) chandef = &chanctx_conf->def; else goto fail_rcu; /* * If driver/HW supports IEEE80211_CHAN_CAN_MONITOR we still * shouldn't transmit on disabled channels. */ if (!cfg80211_chandef_usable(local->hw.wiphy, chandef, IEEE80211_CHAN_DISABLED)) goto fail_rcu; /* * Frame injection is not allowed if beaconing is not allowed * or if we need radar detection. Beaconing is usually not allowed when * the mode or operation (Adhoc, AP, Mesh) does not support DFS. * Passive scan is also used in world regulatory domains where * your country is not known and as such it should be treated as * NO TX unless the channel is explicitly allowed in which case * your current regulatory domain would not have the passive scan * flag. * * Since AP mode uses monitor interfaces to inject/TX management * frames we can make AP mode the exception to this rule once it * supports radar detection as its implementation can deal with * radar detection by itself. We can do that later by adding a * monitor flag interfaces used for AP support. */ if (!cfg80211_reg_can_beacon(local->hw.wiphy, chandef, sdata->vif.type)) goto fail_rcu; info->band = chandef->chan->band; /* Initialize skb->priority according to frame type and TID class, * with respect to the sub interface that the frame will actually * be transmitted on. If the DONT_REORDER flag is set, the original * skb-priority is preserved to assure frames injected with this * flag are not reordered relative to each other. */ ieee80211_select_queue_80211(sdata, skb, hdr); skb_set_queue_mapping(skb, ieee80211_ac_from_tid(skb->priority)); /* * Process the radiotap header. This will now take into account the * selected chandef above to accurately set injection rates and * retransmissions. */ if (!ieee80211_parse_tx_radiotap(skb, dev)) goto fail_rcu; /* remove the injection radiotap header */ skb_pull(skb, len_rthdr); ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); return NETDEV_TX_OK; fail_rcu: rcu_read_unlock(); fail: dev_kfree_skb(skb); return NETDEV_TX_OK; /* meaning, we dealt with the skb */ } static inline bool ieee80211_is_tdls_setup(struct sk_buff *skb) { u16 ethertype = (skb->data[12] << 8) | skb->data[13]; return ethertype == ETH_P_TDLS && skb->len > 14 && skb->data[14] == WLAN_TDLS_SNAP_RFTYPE; } int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info **sta_out) { struct sta_info *sta; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); if (sta) { *sta_out = sta; return 0; } else if (sdata->wdev.use_4addr) { return -ENOLINK; } fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_ADHOC: if (is_multicast_ether_addr(skb->data)) { *sta_out = ERR_PTR(-ENOENT); return 0; } sta = sta_info_get_bss(sdata, skb->data); break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: /* determined much later */ *sta_out = NULL; return 0; #endif case NL80211_IFTYPE_STATION: if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { sta = sta_info_get(sdata, skb->data); if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) { *sta_out = sta; return 0; } /* * TDLS link during setup - throw out frames to * peer. Allow TDLS-setup frames to unauthorized * peers for the special case of a link teardown * after a TDLS sta is removed due to being * unreachable. */ if (!ieee80211_is_tdls_setup(skb)) return -EINVAL; } } sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (!sta) return -ENOLINK; break; default: return -EINVAL; } *sta_out = sta ?: ERR_PTR(-ENOENT); return 0; } static u16 ieee80211_store_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u32 *info_flags, u64 *cookie) { struct sk_buff *ack_skb; u16 info_id = 0; if (skb->sk) ack_skb = skb_clone_sk(skb); else ack_skb = skb_clone(skb, GFP_ATOMIC); if (ack_skb) { unsigned long flags; int id; spin_lock_irqsave(&local->ack_status_lock, flags); id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (id >= 0) { info_id = id; *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (cookie) { *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; } } else { kfree_skb(ack_skb); } } return info_id; } /** * ieee80211_build_hdr - build 802.11 header in the given frame * @sdata: virtual interface to build the header for * @skb: the skb to build the header in * @info_flags: skb flags to set * @sta: the station pointer * @ctrl_flags: info control flags to set * @cookie: cookie pointer to fill (if not %NULL) * * This function takes the skb with 802.3 header and reformats the header to * the appropriate IEEE 802.11 header based on which interface the packet is * being transmitted on. * * Note that this function also takes care of the TX status request and * potential unsharing of the SKB - this needs to be interleaved with the * header building. * * The function requires the read-side RCU lock held * * Returns: the (possibly reallocated) skb or an ERR_PTR() code */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, struct sta_info *sta, u32 ctrl_flags, u64 *cookie) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; int head_need; u16 ethertype, hdrlen, meshhdrlen = 0; __le16 fc; struct ieee80211_hdr hdr; struct ieee80211s_hdr mesh_hdr __maybe_unused; struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL; const u8 *encaps_data; int encaps_len, skip_header_bytes; bool wme_sta = false, authorized = false; bool tdls_peer; bool multicast; u16 info_id = 0; struct ieee80211_chanctx_conf *chanctx_conf = NULL; enum nl80211_band band; int ret; u8 link_id = u32_get_bits(ctrl_flags, IEEE80211_TX_CTRL_MLO_LINK); if (IS_ERR(sta)) sta = NULL; #ifdef CONFIG_MAC80211_DEBUGFS if (local->force_tx_status) info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; #endif /* convert Ethernet header to proper 802.11 header (based on * operation mode) */ ethertype = (skb->data[12] << 8) | skb->data[13]; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); if (!ieee80211_vif_is_mld(&sdata->vif)) chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: if (sdata->wdev.use_4addr) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr1, sta->sta.addr, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); wme_sta = sta->sta.wme; } if (!ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_sub_if_data *ap_sdata; /* override chanctx_conf from AP (we don't have one) */ ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); chanctx_conf = rcu_dereference(ap_sdata->vif.bss_conf.chanctx_conf); } if (sdata->wdev.use_4addr) break; fallthrough; case NL80211_IFTYPE_AP: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ memcpy(hdr.addr1, skb->data, ETH_ALEN); if (ieee80211_vif_is_mld(&sdata->vif) && sta && !sta->sta.mlo) { struct ieee80211_link_data *link; link_id = sta->deflink.link_id; link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { ret = -ENOLINK; goto free; } memcpy(hdr.addr2, link->conf->addr, ETH_ALEN); } else if (link_id == IEEE80211_LINK_UNSPECIFIED || (sta && sta->sta.mlo)) { memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_bss_conf *conf; conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (unlikely(!conf)) { ret = -ENOLINK; goto free; } memcpy(hdr.addr2, conf->addr, ETH_ALEN); } memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 24; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: if (!is_multicast_ether_addr(skb->data)) { struct sta_info *next_hop; bool mpp_lookup = true; mpath = mesh_path_lookup(sdata, skb->data); if (mpath) { mpp_lookup = false; next_hop = rcu_dereference(mpath->next_hop); if (!next_hop || !(mpath->flags & (MESH_PATH_ACTIVE | MESH_PATH_RESOLVING))) mpp_lookup = true; } if (mpp_lookup) { mppath = mpp_path_lookup(sdata, skb->data); if (mppath) mppath->exp_time = jiffies; } if (mppath && mpath) mesh_path_del(sdata, mpath->dst); } /* * Use address extension if it is a packet from * another interface or if we know the destination * is being proxied by a portal (i.e. portal address * differs from proxied address) */ if (ether_addr_equal(sdata->vif.addr, skb->data + ETH_ALEN) && !(mppath && !ether_addr_equal(mppath->mpp, skb->data))) { hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, skb->data, skb->data + ETH_ALEN); meshhdrlen = ieee80211_new_mesh_header(sdata, &mesh_hdr, NULL, NULL); } else { /* DS -> MBSS (802.11-2012 13.11.3.3). * For unicast with unknown forwarding information, * destination might be in the MBSS or if that fails * forwarded to another mesh gate. In either case * resolution will be handled in ieee80211_xmit(), so * leave the original DA. This also works for mcast */ const u8 *mesh_da = skb->data; if (mppath) mesh_da = mppath->mpp; else if (mpath) mesh_da = mpath->dst; hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, mesh_da, sdata->vif.addr); if (is_multicast_ether_addr(mesh_da)) /* DA TA mSA AE:SA */ meshhdrlen = ieee80211_new_mesh_header( sdata, &mesh_hdr, skb->data + ETH_ALEN, NULL); else /* RA TA mDA mSA AE:DA SA */ meshhdrlen = ieee80211_new_mesh_header( sdata, &mesh_hdr, skb->data, skb->data + ETH_ALEN); } /* For injected frames, fill RA right away as nexthop lookup * will be skipped. */ if ((ctrl_flags & IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP) && is_zero_ether_addr(hdr.addr1)) memcpy(hdr.addr1, skb->data, ETH_ALEN); break; #endif case NL80211_IFTYPE_STATION: /* we already did checks when looking up the RA STA */ tdls_peer = test_sta_flag(sta, WLAN_STA_TDLS_PEER); if (tdls_peer) { /* For TDLS only one link can be valid with peer STA */ int tdls_link_id = ieee80211_tdls_sta_link_id(sta); struct ieee80211_link_data *link; /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); link = rcu_dereference(sdata->link[tdls_link_id]); if (WARN_ON_ONCE(!link)) { ret = -EINVAL; goto free; } memcpy(hdr.addr3, link->u.mgd.bssid, ETH_ALEN); hdrlen = 24; } else if (sdata->u.mgd.use_4addr && cpu_to_be16(ethertype) != sdata->control_port_protocol) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; } else { fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ memcpy(hdr.addr1, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); hdrlen = 24; } break; case NL80211_IFTYPE_OCB: /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); eth_broadcast_addr(hdr.addr3); hdrlen = 24; break; case NL80211_IFTYPE_ADHOC: /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN); hdrlen = 24; break; default: ret = -EINVAL; goto free; } if (!chanctx_conf) { if (!ieee80211_vif_is_mld(&sdata->vif)) { ret = -ENOTCONN; goto free; } /* MLD transmissions must not rely on the band */ band = 0; } else { band = chanctx_conf->def.chan->band; } multicast = is_multicast_ether_addr(hdr.addr1); /* sta is always NULL for mesh */ if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); wme_sta = sta->sta.wme; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { /* For mesh, the use of the QoS header is mandatory */ wme_sta = true; } /* receiver does QoS (which also means we do) use it */ if (wme_sta) { fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); hdrlen += 2; } /* * Drop unicast frames to unauthorised stations unless they are * EAPOL frames from the local station. */ if (unlikely(!ieee80211_vif_is_mesh(&sdata->vif) && (sdata->vif.type != NL80211_IFTYPE_OCB) && !multicast && !authorized && (cpu_to_be16(ethertype) != sdata->control_port_protocol || !ieee80211_is_our_addr(sdata, skb->data + ETH_ALEN, NULL)))) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG net_info_ratelimited("%s: dropped frame to %pM (unauthorized port)\n", sdata->name, hdr.addr1); #endif I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); ret = -EPERM; goto free; } if (unlikely(!multicast && (sk_requests_wifi_status(skb->sk) || ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) info_id = ieee80211_store_ack_skb(local, skb, &info_flags, cookie); /* * If the skb is shared we need to obtain our own copy. */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) { ret = -ENOMEM; goto free; } hdr.frame_control = fc; hdr.duration_id = 0; hdr.seq_ctrl = 0; skip_header_bytes = ETH_HLEN; if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { encaps_data = bridge_tunnel_header; encaps_len = sizeof(bridge_tunnel_header); skip_header_bytes -= 2; } else if (ethertype >= ETH_P_802_3_MIN) { encaps_data = rfc1042_header; encaps_len = sizeof(rfc1042_header); skip_header_bytes -= 2; } else { encaps_data = NULL; encaps_len = 0; } skb_pull(skb, skip_header_bytes); head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb); /* * So we need to modify the skb header and hence need a copy of * that. The head_need variable above doesn't, so far, include * the needed header space that we don't need right away. If we * can, then we don't reallocate right now but only after the * frame arrives at the master device (if it does...) * * If we cannot, however, then we will reallocate to include all * the ever needed space. Also, if we need to reallocate it anyway, * make it big enough for everything we may ever need. */ if (head_need > 0 || skb_cloned(skb)) { head_need += IEEE80211_ENCRYPT_HEADROOM; head_need += local->tx_headroom; head_need = max_t(int, 0, head_need); if (ieee80211_skb_resize(sdata, skb, head_need, ENCRYPT_DATA)) { ieee80211_free_txskb(&local->hw, skb); skb = NULL; return ERR_PTR(-ENOMEM); } } if (encaps_data) memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); #ifdef CONFIG_MAC80211_MESH if (meshhdrlen > 0) memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen); #endif if (ieee80211_is_data_qos(fc)) { __le16 *qos_control; qos_control = skb_push(skb, 2); memcpy(skb_push(skb, hdrlen - 2), &hdr, hdrlen - 2); /* * Maybe we could actually set some fields here, for now just * initialise to zero to indicate no special operation. */ *qos_control = 0; } else memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); skb_reset_mac_header(skb); info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->flags = info_flags; if (info_id) { info->status_data = info_id; info->status_data_idr = 1; } info->band = band; if (likely(!cookie)) { ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK); } else { unsigned int pre_conf_link_id; /* * ctrl_flags already have been set by * ieee80211_tx_control_port(), here * we just sanity check that */ pre_conf_link_id = u32_get_bits(ctrl_flags, IEEE80211_TX_CTRL_MLO_LINK); if (pre_conf_link_id != link_id && link_id != IEEE80211_LINK_UNSPECIFIED) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG net_info_ratelimited("%s: dropped frame to %pM with bad link ID request (%d vs. %d)\n", sdata->name, hdr.addr1, pre_conf_link_id, link_id); #endif ret = -EINVAL; goto free; } } info->control.flags = ctrl_flags; return skb; free: kfree_skb(skb); return ERR_PTR(ret); } /* * fast-xmit overview * * The core idea of this fast-xmit is to remove per-packet checks by checking * them out of band. ieee80211_check_fast_xmit() implements the out-of-band * checks that are needed to get the sta->fast_tx pointer assigned, after which * much less work can be done per packet. For example, fragmentation must be * disabled or the fast_tx pointer will not be set. All the conditions are seen * in the code here. * * Once assigned, the fast_tx data structure also caches the per-packet 802.11 * header and other data to aid packet processing in ieee80211_xmit_fast(). * * The most difficult part of this is that when any of these assumptions * change, an external trigger (i.e. a call to ieee80211_clear_fast_xmit(), * ieee80211_check_fast_xmit() or friends) is required to reset the data, * since the per-packet code no longer checks the conditions. This is reflected * by the calls to these functions throughout the rest of the code, and must be * maintained if any of the TX path checks change. */ void ieee80211_check_fast_xmit(struct sta_info *sta) { struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_hdr *hdr = (void *)build.hdr; struct ieee80211_chanctx_conf *chanctx_conf; __le16 fc; if (!ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT)) return; if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_fast_tx_flush_sta(sdata, sta); /* Locking here protects both the pointer itself, and against concurrent * invocations winning data access races to, e.g., the key pointer that * is used. * Without it, the invocation of this function right after the key * pointer changes wouldn't be sufficient, as another CPU could access * the pointer, then stall, and then do the cache update after the CPU * that invalidated the key. * With the locking, such scenarios cannot happen as the check for the * key and the fast-tx assignment are done atomically, so the CPU that * modifies the key will either wait or other one will see the key * cleared/changed already. */ spin_lock_bh(&sta->lock); if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) && sdata->vif.type == NL80211_IFTYPE_STATION) goto out; if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->uploaded) goto out; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER) || test_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT)) goto out; if (sdata->noack_map) goto out; /* fast-xmit doesn't handle fragmentation at all */ if (local->hw.wiphy->frag_threshold != (u32)-1 && !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG)) goto out; if (!ieee80211_vif_is_mld(&sdata->vif)) { rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); goto out; } build.band = chanctx_conf->def.chan->band; rcu_read_unlock(); } else { /* MLD transmissions must not rely on the band */ build.band = 0; } fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: /* DA SA BSSID */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); memcpy(hdr->addr3, sdata->u.ibss.bssid, ETH_ALEN); build.hdr_len = 24; break; case NL80211_IFTYPE_STATION: if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* For TDLS only one link can be valid with peer STA */ int tdls_link_id = ieee80211_tdls_sta_link_id(sta); struct ieee80211_link_data *link; /* DA SA BSSID */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); rcu_read_lock(); link = rcu_dereference(sdata->link[tdls_link_id]); if (!WARN_ON_ONCE(!link)) memcpy(hdr->addr3, link->u.mgd.bssid, ETH_ALEN); rcu_read_unlock(); build.hdr_len = 24; break; } if (sdata->u.mgd.use_4addr) { /* non-regular ethertype cannot use the fastpath */ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr->addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr4); build.hdr_len = 30; break; } fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ memcpy(hdr->addr1, sdata->vif.cfg.ap_addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); build.hdr_len = 24; break; case NL80211_IFTYPE_AP_VLAN: if (sdata->wdev.use_4addr) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr4); build.hdr_len = 30; break; } fallthrough; case NL80211_IFTYPE_AP: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); if (sta->sta.mlo || !ieee80211_vif_is_mld(&sdata->vif)) { memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); } else { unsigned int link_id = sta->deflink.link_id; struct ieee80211_link_data *link; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); goto out; } memcpy(hdr->addr2, link->conf->addr, ETH_ALEN); rcu_read_unlock(); } build.sa_offs = offsetof(struct ieee80211_hdr, addr3); build.hdr_len = 24; break; default: /* not handled on fast-xmit */ goto out; } if (sta->sta.wme) { build.hdr_len += 2; fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); } /* We store the key here so there's no point in using rcu_dereference() * but that's fine because the code that changes the pointers will call * this function after doing so. For a single CPU that would be enough, * for multiple see the comment above. */ build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]); if (!build.key) build.key = rcu_access_pointer(sdata->default_unicast_key); if (build.key) { bool gen_iv, iv_spc, mmic; gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; mmic = build.key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE); /* don't handle software crypto */ if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) goto out; /* Key is being removed */ if (build.key->flags & KEY_FLAG_TAINTED) goto out; switch (build.key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: if (gen_iv) build.pn_offs = build.hdr_len; if (gen_iv || iv_spc) build.hdr_len += IEEE80211_CCMP_HDR_LEN; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (gen_iv) build.pn_offs = build.hdr_len; if (gen_iv || iv_spc) build.hdr_len += IEEE80211_GCMP_HDR_LEN; break; case WLAN_CIPHER_SUITE_TKIP: /* cannot handle MMIC or IV generation in xmit-fast */ if (mmic || gen_iv) goto out; if (iv_spc) build.hdr_len += IEEE80211_TKIP_IV_LEN; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* cannot handle IV generation in fast-xmit */ if (gen_iv) goto out; if (iv_spc) build.hdr_len += IEEE80211_WEP_IV_LEN; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: WARN(1, "management cipher suite 0x%x enabled for data\n", build.key->conf.cipher); goto out; default: /* we don't know how to generate IVs for this at all */ if (WARN_ON(gen_iv)) goto out; } fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } hdr->frame_control = fc; memcpy(build.hdr + build.hdr_len, rfc1042_header, sizeof(rfc1042_header)); build.hdr_len += sizeof(rfc1042_header); fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC); /* if the kmemdup fails, continue w/o fast_tx */ out: /* we might have raced against another call to this function */ old = rcu_dereference_protected(sta->fast_tx, lockdep_is_held(&sta->lock)); rcu_assign_pointer(sta->fast_tx, fast_tx); if (old) kfree_rcu(old, rcu_head); spin_unlock_bh(&sta->lock); } void ieee80211_check_fast_xmit_all(struct ieee80211_local *local) { struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) ieee80211_check_fast_xmit(sta); rcu_read_unlock(); } void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata && (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) continue; ieee80211_check_fast_xmit(sta); } rcu_read_unlock(); } void ieee80211_clear_fast_xmit(struct sta_info *sta) { struct ieee80211_fast_tx *fast_tx; spin_lock_bh(&sta->lock); fast_tx = rcu_dereference_protected(sta->fast_tx, lockdep_is_held(&sta->lock)); RCU_INIT_POINTER(sta->fast_tx, NULL); spin_unlock_bh(&sta->lock); if (fast_tx) kfree_rcu(fast_tx, rcu_head); } static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local, struct sk_buff *skb, int headroom) { if (skb_headroom(skb) < headroom) { I802_DEBUG_INC(local->tx_expand_skb_head); if (pskb_expand_head(skb, headroom, 0, GFP_ATOMIC)) { wiphy_debug(local->hw.wiphy, "failed to reallocate TX buffer\n"); return false; } } return true; } static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ethhdr *amsdu_hdr; int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header); int subframe_len = skb->len - hdr_len; void *data; u8 *qc, *h_80211_src, *h_80211_dst; const u8 *bssid; if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) return false; if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) return true; if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr) + local->hw.extra_tx_headroom)) return false; data = skb_push(skb, sizeof(*amsdu_hdr)); memmove(data, data + sizeof(*amsdu_hdr), hdr_len); hdr = data; amsdu_hdr = data + hdr_len; /* h_80211_src/dst is addr* field within hdr */ h_80211_src = data + fast_tx->sa_offs; h_80211_dst = data + fast_tx->da_offs; amsdu_hdr->h_proto = cpu_to_be16(subframe_len); ether_addr_copy(amsdu_hdr->h_source, h_80211_src); ether_addr_copy(amsdu_hdr->h_dest, h_80211_dst); /* according to IEEE 802.11-2012 8.3.2 table 8-19, the outer SA/DA * fields needs to be changed to BSSID for A-MSDU frames depending * on FromDS/ToDS values. */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: bssid = sdata->vif.cfg.ap_addr; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: bssid = sdata->vif.addr; break; default: bssid = NULL; } if (bssid && ieee80211_has_fromds(hdr->frame_control)) ether_addr_copy(h_80211_src, bssid); if (bssid && ieee80211_has_tods(hdr->frame_control)) ether_addr_copy(h_80211_dst, bssid); qc = ieee80211_get_qos_ctl(hdr); *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT; info->control.flags |= IEEE80211_TX_CTRL_AMSDU; return true; } static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb, const u8 *da, const u8 *sa) { struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; struct fq_tin *tin; struct fq_flow *flow; u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; struct ieee80211_txq *txq = sta->sta.txq[tid]; struct txq_info *txqi; struct sk_buff **frag_tail, *head; int subframe_len = skb->len - ETH_ALEN; u8 max_subframes = sta->sta.max_amsdu_subframes; int max_frags = local->hw.max_tx_fragments; int max_amsdu_len = sta->sta.cur->max_amsdu_len; int orig_truesize; u32 flow_idx; __be16 len; void *data; bool ret = false; unsigned int orig_len; int n = 2, nfrags, pad = 0; u16 hdrlen; if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) return false; if (sdata->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED) return false; if (ieee80211_vif_is_mesh(&sdata->vif)) return false; if (skb_is_gso(skb)) return false; if (!txq) return false; txqi = to_txq_info(txq); if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags)) return false; if (sta->sta.cur->max_rc_amsdu_len) max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.cur->max_rc_amsdu_len); if (sta->sta.cur->max_tid_amsdu_len[tid]) max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.cur->max_tid_amsdu_len[tid]); flow_idx = fq_flow_idx(fq, skb); spin_lock_bh(&fq->lock); /* TODO: Ideally aggregation should be done on dequeue to remain * responsive to environment changes. */ tin = &txqi->tin; flow = fq_flow_classify(fq, tin, flow_idx, skb); head = skb_peek_tail(&flow->queue); if (!head || skb_is_gso(head)) goto out; orig_truesize = head->truesize; orig_len = head->len; if (skb->len + head->len > max_amsdu_len) goto out; nfrags = 1 + skb_shinfo(skb)->nr_frags; nfrags += 1 + skb_shinfo(head)->nr_frags; frag_tail = &skb_shinfo(head)->frag_list; while (*frag_tail) { nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags; frag_tail = &(*frag_tail)->next; n++; } if (max_subframes && n > max_subframes) goto out; if (max_frags && nfrags > max_frags) goto out; if (!drv_can_aggregate_in_amsdu(local, head, skb)) goto out; if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; /* If n == 2, the "while (*frag_tail)" loop above didn't execute * and frag_tail should be &skb_shinfo(head)->frag_list. * However, ieee80211_amsdu_prepare_head() can reallocate it. * Reload frag_tail to have it pointing to the correct place. */ if (n == 2) frag_tail = &skb_shinfo(head)->frag_list; /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. Note that head->len * is the length of the full A-MSDU, but that works since each time * we add a new subframe we pad out the previous one to a multiple * of 4 and thus it no longer matters in the next round. */ hdrlen = fast_tx->hdr_len - sizeof(rfc1042_header); if ((head->len - hdrlen) & 3) pad = 4 - ((head->len - hdrlen) & 3); if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2 + pad)) goto out_recalc; ret = true; data = skb_push(skb, ETH_ALEN + 2); ether_addr_copy(data, da); ether_addr_copy(data + ETH_ALEN, sa); data += 2 * ETH_ALEN; len = cpu_to_be16(subframe_len); memcpy(data, &len, 2); memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header)); memset(skb_push(skb, pad), 0, pad); head->len += skb->len; head->data_len += skb->len; *frag_tail = skb; out_recalc: fq->memory_usage += head->truesize - orig_truesize; if (head->len != orig_len) { flow->backlog += head->len - orig_len; tin->backlog_bytes += head->len - orig_len; } out: spin_unlock_bh(&fq->lock); return ret; } /* * Can be called while the sta lock is held. Anything that can cause packets to * be generated will cause deadlock! */ static ieee80211_tx_result ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 pn_offs, struct ieee80211_key *key, struct ieee80211_tx_data *tx) { struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; u8 tid = IEEE80211_NUM_TIDS; if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL) && ieee80211_tx_h_rate_ctrl(tx) != TX_CONTINUE) return TX_DROP; if (key) info->control.hw_key = &key->conf; dev_sw_netstats_tx_add(skb->dev, 1, skb->len); if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); sdata->sequence_number += 0x10; } if (skb_shinfo(skb)->gso_size) sta->deflink.tx_stats.msdu[tid] += DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); else sta->deflink.tx_stats.msdu[tid]++; info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; /* statistics normally done by ieee80211_tx_h_stats (but that * has to consider fragmentation, so is more complex) */ sta->deflink.tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; sta->deflink.tx_stats.packets[skb_get_queue_mapping(skb)]++; if (pn_offs) { u64 pn; u8 *crypto_hdr = skb->data + pn_offs; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: pn = atomic64_inc_return(&key->conf.tx_pn); crypto_hdr[0] = pn; crypto_hdr[1] = pn >> 8; crypto_hdr[3] = 0x20 | (key->conf.keyidx << 6); crypto_hdr[4] = pn >> 16; crypto_hdr[5] = pn >> 24; crypto_hdr[6] = pn >> 32; crypto_hdr[7] = pn >> 40; break; } } return TX_CONTINUE; } static netdev_features_t ieee80211_sdata_netdev_features(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN) return sdata->vif.netdev_features; if (!sdata->bss) return 0; sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); return sdata->vif.netdev_features; } static struct sk_buff * ieee80211_tx_skb_fixup(struct sk_buff *skb, netdev_features_t features) { if (skb_is_gso(skb)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (!segs) return skb; if (IS_ERR(segs)) goto free; consume_skb(skb); return segs; } if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto free; if (skb->ip_summed == CHECKSUM_PARTIAL) { int ofs = skb_checksum_start_offset(skb); if (skb->encapsulation) skb_set_inner_transport_header(skb, ofs); else skb_set_transport_header(skb, ofs); if (skb_csum_hwoffload_help(skb, features)) goto free; } skb_mark_not_on_list(skb); return skb; free: kfree_skb(skb); return NULL; } void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb, bool ampdu, const u8 *da, const u8 *sa) { struct ieee80211_local *local = sdata->local; struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; int hw_headroom = sdata->local->hw.extra_tx_headroom; int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb, da, sa)) return; /* will not be crypto-handled beyond what we do here, so use false * as the may-encrypt argument for the resize to not account for * more room than we already have in 'extra_head' */ if (unlikely(ieee80211_skb_resize(sdata, skb, max_t(int, extra_head + hw_headroom - skb_headroom(skb), 0), ENCRYPT_NO))) goto free; hdr = skb_push(skb, extra_head); memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len); memcpy(skb->data + fast_tx->da_offs, da, ETH_ALEN); memcpy(skb->data + fast_tx->sa_offs, sa, ETH_ALEN); info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->band = fast_tx->band; info->control.vif = &sdata->vif; info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | IEEE80211_TX_CTL_DONTFRAG; info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT | u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, IEEE80211_TX_CTRL_MLO_LINK); #ifdef CONFIG_MAC80211_DEBUGFS if (local->force_tx_status) info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; #endif if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; *ieee80211_get_qos_ctl(hdr) = tid; } __skb_queue_head_init(&tx.skbs); tx.flags = IEEE80211_TX_UNICAST; tx.local = local; tx.sdata = sdata; tx.sta = sta; tx.key = fast_tx->key; if (ieee80211_queue_skb(local, sdata, sta, skb)) return; tx.skb = skb; r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, fast_tx->key, &tx); tx.skb = NULL; if (r == TX_DROP) { tx.sdata->tx_handlers_drop++; goto free; } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); __skb_queue_tail(&tx.skbs, skb); ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); return; free: kfree_skb(skb); } static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb) { u16 ethertype = (skb->data[12] << 8) | skb->data[13]; struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; struct tid_ampdu_tx *tid_tx = NULL; struct sk_buff *next; struct ethhdr eth; u8 tid = IEEE80211_NUM_TIDS; /* control port protocol needs a lot of special handling */ if (cpu_to_be16(ethertype) == sdata->control_port_protocol) return false; /* only RFC 1042 SNAP */ if (ethertype < ETH_P_802_3_MIN) return false; /* don't handle TX status request here either */ if (sk_requests_wifi_status(skb->sk)) return false; if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) return false; if (tid_tx->timeout) tid_tx->last_tx = jiffies; } } memcpy(&eth, skb->data, ETH_HLEN - 2); /* after this point (skb is modified) we cannot return false */ skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata)); if (!skb) return true; skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); __ieee80211_xmit_fast(sdata, sta, fast_tx, skb, tid_tx, eth.h_dest, eth.h_source); } return true; } struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = container_of(txq, struct txq_info, txq); struct ieee80211_hdr *hdr; struct sk_buff *skb = NULL; struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; struct ieee80211_vif *vif = txq->vif; int q = vif->hw_queue[txq->ac]; unsigned long flags; bool q_stopped; WARN_ON_ONCE(softirq_count() == 0); if (!ieee80211_txq_airtime_check(hw, txq)) return NULL; begin: spin_lock_irqsave(&local->queue_stop_reason_lock, flags); q_stopped = local->queue_stop_reasons[q]; spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); if (unlikely(q_stopped)) { /* mark for waking later */ set_bit(IEEE80211_TXQ_DIRTY, &txqi->flags); return NULL; } spin_lock_bh(&fq->lock); /* Make sure fragments stay together. */ skb = __skb_dequeue(&txqi->frags); if (unlikely(skb)) { if (!(IEEE80211_SKB_CB(skb)->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING)) goto out; IEEE80211_SKB_CB(skb)->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; } else { if (unlikely(test_bit(IEEE80211_TXQ_STOP, &txqi->flags))) goto out; skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); } if (!skb) goto out; spin_unlock_bh(&fq->lock); hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); memset(&tx, 0, sizeof(tx)); __skb_queue_head_init(&tx.skbs); tx.local = local; tx.skb = skb; tx.sdata = vif_to_sdata(info->control.vif); if (txq->sta) { tx.sta = container_of(txq->sta, struct sta_info, sta); /* * Drop unicast frames to unauthorised stations unless they are * injected frames or EAPOL frames from the local station. */ if (unlikely(!(info->flags & IEEE80211_TX_CTL_INJECTED) && ieee80211_is_data(hdr->frame_control) && !ieee80211_vif_is_mesh(&tx.sdata->vif) && tx.sdata->vif.type != NL80211_IFTYPE_OCB && !is_multicast_ether_addr(hdr->addr1) && !test_sta_flag(tx.sta, WLAN_STA_AUTHORIZED) && (!(info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO) || !ieee80211_is_our_addr(tx.sdata, hdr->addr2, NULL)))) { I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); ieee80211_free_txskb(&local->hw, skb); goto begin; } } /* * The key can be removed while the packet was queued, so need to call * this here to get the current key. */ info->control.hw_key = NULL; r = ieee80211_tx_h_select_key(&tx); if (r != TX_CONTINUE) { ieee80211_free_txskb(&local->hw, skb); goto begin; } if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags)) info->flags |= (IEEE80211_TX_CTL_AMPDU | IEEE80211_TX_CTL_DONTFRAG); if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { r = ieee80211_tx_h_rate_ctrl(&tx); if (r != TX_CONTINUE) { ieee80211_free_txskb(&local->hw, skb); goto begin; } } goto encap_out; } if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { struct sta_info *sta = container_of(txq->sta, struct sta_info, sta); u8 pn_offs = 0; if (tx.key && (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) pn_offs = ieee80211_hdrlen(hdr->frame_control); r = ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, tx.key, &tx); if (r != TX_CONTINUE) { ieee80211_free_txskb(&local->hw, skb); goto begin; } } else { if (invoke_tx_handlers_late(&tx)) goto begin; skb = __skb_dequeue(&tx.skbs); info = IEEE80211_SKB_CB(skb); if (!skb_queue_empty(&tx.skbs)) { spin_lock_bh(&fq->lock); skb_queue_splice_tail(&tx.skbs, &txqi->frags); spin_unlock_bh(&fq->lock); } } if (skb_has_frag_list(skb) && !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { if (skb_linearize(skb)) { ieee80211_free_txskb(&local->hw, skb); goto begin; } } switch (tx.sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if ((tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { vif = &tx.sdata->vif; break; } tx.sdata = rcu_dereference(local->monitor_sdata); if (tx.sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &tx.sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { ieee80211_free_txskb(&local->hw, skb); goto begin; } else { info->control.vif = NULL; return skb; } break; case NL80211_IFTYPE_AP_VLAN: tx.sdata = container_of(tx.sdata->bss, struct ieee80211_sub_if_data, u.ap); fallthrough; default: vif = &tx.sdata->vif; break; } encap_out: info->control.vif = vif; if (tx.sta && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) { bool ampdu = txq->ac != IEEE80211_AC_VO; u32 airtime; airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, skb->len, ampdu); if (airtime) { airtime = ieee80211_info_set_tx_time_est(info, airtime); ieee80211_sta_update_pending_airtime(local, tx.sta, txq->ac, airtime, false); } } return skb; out: spin_unlock_bh(&fq->lock); return skb; } EXPORT_SYMBOL(ieee80211_tx_dequeue); static inline s32 ieee80211_sta_deficit(struct sta_info *sta, u8 ac) { struct airtime_info *air_info = &sta->airtime[ac]; return air_info->deficit - atomic_read(&air_info->aql_tx_pending); } static void ieee80211_txq_set_active(struct txq_info *txqi) { struct sta_info *sta; if (!txqi->txq.sta) return; sta = container_of(txqi->txq.sta, struct sta_info, sta); sta->airtime[txqi->txq.ac].last_active = jiffies; } static bool ieee80211_txq_keep_active(struct txq_info *txqi) { struct sta_info *sta; if (!txqi->txq.sta) return false; sta = container_of(txqi->txq.sta, struct sta_info, sta); if (ieee80211_sta_deficit(sta, txqi->txq.ac) >= 0) return false; return ieee80211_sta_keep_active(sta, txqi->txq.ac); } struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_txq *ret = NULL; struct txq_info *txqi = NULL, *head = NULL; bool found_eligible_txq = false; spin_lock_bh(&local->active_txq_lock[ac]); if (!local->schedule_round[ac]) goto out; begin: txqi = list_first_entry_or_null(&local->active_txqs[ac], struct txq_info, schedule_order); if (!txqi) goto out; if (txqi == head) { if (!found_eligible_txq) goto out; else found_eligible_txq = false; } if (!head) head = txqi; if (txqi->txq.sta) { struct sta_info *sta = container_of(txqi->txq.sta, struct sta_info, sta); bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq); s32 deficit = ieee80211_sta_deficit(sta, txqi->txq.ac); if (aql_check) found_eligible_txq = true; if (deficit < 0) sta->airtime[txqi->txq.ac].deficit += sta->airtime_weight; if (deficit < 0 || !aql_check) { list_move_tail(&txqi->schedule_order, &local->active_txqs[txqi->txq.ac]); goto begin; } } if (txqi->schedule_round == local->schedule_round[ac]) goto out; list_del_init(&txqi->schedule_order); txqi->schedule_round = local->schedule_round[ac]; ret = &txqi->txq; out: spin_unlock_bh(&local->active_txq_lock[ac]); return ret; } EXPORT_SYMBOL(ieee80211_next_txq); void __ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, bool force) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); bool has_queue; spin_lock_bh(&local->active_txq_lock[txq->ac]); has_queue = force || (!test_bit(IEEE80211_TXQ_STOP, &txqi->flags) && txq_has_queue(txq)); if (list_empty(&txqi->schedule_order) && (has_queue || ieee80211_txq_keep_active(txqi))) { /* If airtime accounting is active, always enqueue STAs at the * head of the list to ensure that they only get moved to the * back by the airtime DRR scheduler once they have a negative * deficit. A station that already has a negative deficit will * get immediately moved to the back of the list on the next * call to ieee80211_next_txq(). */ if (txqi->txq.sta && local->airtime_flags && has_queue && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) list_add(&txqi->schedule_order, &local->active_txqs[txq->ac]); else list_add_tail(&txqi->schedule_order, &local->active_txqs[txq->ac]); if (has_queue) ieee80211_txq_set_active(txqi); } spin_unlock_bh(&local->active_txq_lock[txq->ac]); } EXPORT_SYMBOL(__ieee80211_schedule_txq); DEFINE_STATIC_KEY_FALSE(aql_disable); bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct sta_info *sta; struct ieee80211_local *local = hw_to_local(hw); if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return true; if (static_branch_unlikely(&aql_disable)) return true; if (!txq->sta) return true; if (unlikely(txq->tid == IEEE80211_NUM_TIDS)) return true; sta = container_of(txq->sta, struct sta_info, sta); if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < sta->airtime[txq->ac].aql_limit_low) return true; if (atomic_read(&local->aql_total_pending_airtime) < local->aql_threshold && atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < sta->airtime[txq->ac].aql_limit_high) return true; return false; } EXPORT_SYMBOL(ieee80211_txq_airtime_check); static bool ieee80211_txq_schedule_airtime_check(struct ieee80211_local *local, u8 ac) { unsigned int num_txq = 0; struct txq_info *txq; u32 aql_limit; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return true; list_for_each_entry(txq, &local->active_txqs[ac], schedule_order) num_txq++; aql_limit = (num_txq - 1) * local->aql_txq_limit_low[ac] / 2 + local->aql_txq_limit_high[ac]; return atomic_read(&local->aql_ac_pending_airtime[ac]) < aql_limit; } bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *iter, *tmp, *txqi = to_txq_info(txq); struct sta_info *sta; u8 ac = txq->ac; spin_lock_bh(&local->active_txq_lock[ac]); if (!txqi->txq.sta) goto out; if (list_empty(&txqi->schedule_order)) goto out; if (!ieee80211_txq_schedule_airtime_check(local, ac)) goto out; list_for_each_entry_safe(iter, tmp, &local->active_txqs[ac], schedule_order) { if (iter == txqi) break; if (!iter->txq.sta) { list_move_tail(&iter->schedule_order, &local->active_txqs[ac]); continue; } sta = container_of(iter->txq.sta, struct sta_info, sta); if (ieee80211_sta_deficit(sta, ac) < 0) sta->airtime[ac].deficit += sta->airtime_weight; list_move_tail(&iter->schedule_order, &local->active_txqs[ac]); } sta = container_of(txqi->txq.sta, struct sta_info, sta); if (sta->airtime[ac].deficit >= 0) goto out; sta->airtime[ac].deficit += sta->airtime_weight; list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]); spin_unlock_bh(&local->active_txq_lock[ac]); return false; out: if (!list_empty(&txqi->schedule_order)) list_del_init(&txqi->schedule_order); spin_unlock_bh(&local->active_txq_lock[ac]); return true; } EXPORT_SYMBOL(ieee80211_txq_may_transmit); void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); spin_lock_bh(&local->active_txq_lock[ac]); if (ieee80211_txq_schedule_airtime_check(local, ac)) { local->schedule_round[ac]++; if (!local->schedule_round[ac]) local->schedule_round[ac]++; } else { local->schedule_round[ac] = 0; } spin_unlock_bh(&local->active_txq_lock[ac]); } EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, u32 ctrl_flags, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct sk_buff *next; int len = skb->len; if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) { kfree_skb(skb); return; } sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); rcu_read_lock(); if (ieee80211_vif_is_mesh(&sdata->vif) && ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT) && ieee80211_mesh_xmit_fast(sdata, skb, ctrl_flags)) goto out; if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) goto out_free; if (IS_ERR(sta)) sta = NULL; skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb)); ieee80211_aggr_check(sdata, sta, skb); if (sta) { struct ieee80211_fast_tx *fast_tx; fast_tx = rcu_dereference(sta->fast_tx); if (fast_tx && ieee80211_xmit_fast(sdata, sta, fast_tx, skb)) goto out; } /* the frame could be fragmented, software-encrypted, and other * things so we cannot really handle checksum or GSO offload. * fix it up in software before we handle anything else. */ skb = ieee80211_tx_skb_fixup(skb, 0); if (!skb) { len = 0; goto out; } skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); if (skb->protocol == sdata->control_port_protocol) ctrl_flags |= IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP; skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, ctrl_flags, cookie); if (IS_ERR(skb)) { kfree_skb_list(next); goto out; } dev_sw_netstats_tx_add(dev, 1, skb->len); ieee80211_xmit(sdata, sta, skb); } goto out; out_free: kfree_skb(skb); len = 0; out: if (len) ieee80211_tpt_led_trig_tx(local, len); rcu_read_unlock(); } static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta) { struct ethhdr *eth; int err; err = skb_ensure_writable(skb, ETH_HLEN); if (unlikely(err)) return err; eth = (void *)skb->data; ether_addr_copy(eth->h_dest, sta->sta.addr); return 0; } static bool ieee80211_multicast_to_unicast(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); const struct ethhdr *eth = (void *)skb->data; const struct vlan_ethhdr *ethvlan = (void *)skb->data; __be16 ethertype; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: if (sdata->u.vlan.sta) return false; if (sdata->wdev.use_4addr) return false; fallthrough; case NL80211_IFTYPE_AP: /* check runtime toggle for this bss */ if (!sdata->bss->multicast_to_unicast) return false; break; default: return false; } /* multicast to unicast conversion only for some payload */ ethertype = eth->h_proto; if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN) ethertype = ethvlan->h_vlan_encapsulated_proto; switch (ethertype) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): break; default: return false; } return true; } static void ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev, struct sk_buff_head *queue) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; const struct ethhdr *eth = (struct ethhdr *)skb->data; struct sta_info *sta, *first = NULL; struct sk_buff *cloned_skb; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) /* AP-VLAN mismatch */ continue; if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr))) /* do not send back to source */ continue; if (!first) { first = sta; continue; } cloned_skb = skb_clone(skb, GFP_ATOMIC); if (!cloned_skb) goto multicast; if (unlikely(ieee80211_change_da(cloned_skb, sta))) { dev_kfree_skb(cloned_skb); goto multicast; } __skb_queue_tail(queue, cloned_skb); } if (likely(first)) { if (unlikely(ieee80211_change_da(skb, first))) goto multicast; __skb_queue_tail(queue, skb); } else { /* no STA connected, drop */ kfree_skb(skb); skb = NULL; } goto out; multicast: __skb_queue_purge(queue); __skb_queue_tail(queue, skb); out: rcu_read_unlock(); } static void ieee80211_mlo_multicast_tx_one(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 ctrl_flags, unsigned int link_id) { struct sk_buff *out; out = skb_copy(skb, GFP_ATOMIC); if (!out) return; ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK); __ieee80211_subif_start_xmit(out, sdata->dev, 0, ctrl_flags, NULL); } static void ieee80211_mlo_multicast_tx(struct net_device *dev, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); unsigned long links = sdata->vif.active_links; unsigned int link; u32 ctrl_flags = IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX; if (hweight16(links) == 1) { ctrl_flags |= u32_encode_bits(__ffs(links), IEEE80211_TX_CTRL_MLO_LINK); __ieee80211_subif_start_xmit(skb, sdata->dev, 0, ctrl_flags, NULL); return; } for_each_set_bit(link, &links, IEEE80211_MLD_MAX_NUM_LINKS) { ieee80211_mlo_multicast_tx_one(sdata, skb, ctrl_flags, link); ctrl_flags = 0; } kfree_skb(skb); } /** * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs * @skb: packet to be sent * @dev: incoming interface * * On failure skb will be freed. * * Returns: the netdev TX status (but really only %NETDEV_TX_OK) */ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); const struct ethhdr *eth = (void *)skb->data; if (likely(!is_multicast_ether_addr(eth->h_dest))) goto normal; if (unlikely(!ieee80211_sdata_running(sdata))) { kfree_skb(skb); return NETDEV_TX_OK; } if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) { struct sk_buff_head queue; __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) __ieee80211_subif_start_xmit(skb, dev, 0, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); } else if (ieee80211_vif_is_mld(&sdata->vif) && ((sdata->vif.type == NL80211_IFTYPE_AP && !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) || (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->wdev.use_4addr))) { ieee80211_mlo_multicast_tx(dev, skb); } else { normal: __ieee80211_subif_start_xmit(skb, dev, 0, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); } return NETDEV_TX_OK; } static bool __ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info *sta, bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_control control = {}; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sta *pubsta = NULL; unsigned long flags; int q = info->hw_queue; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { if (txpending) skb_queue_head(&local->pending[q], skb); else skb_queue_tail(&local->pending[q], skb); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return false; } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); if (sta && sta->uploaded) pubsta = &sta->sta; control.sta = pubsta; drv_tx(local, &control, skb); return true; } static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info *sta, bool txpending) { struct ieee80211_local *local = sdata->local; struct sk_buff *next; bool ret = true; if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); if (!__ieee80211_tx_8023(sdata, skb, sta, txpending)) ret = false; } return ret; } static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct net_device *dev, struct sta_info *sta, struct ieee80211_key *key, struct sk_buff *skb) { struct ieee80211_tx_info *info; struct ieee80211_local *local = sdata->local; struct tid_ampdu_tx *tid_tx; struct sk_buff *seg, *next; unsigned int skbs = 0, len = 0; u16 queue; u8 tid; queue = ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) goto out_free; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; ieee80211_aggr_check(sdata, sta, skb); tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { /* fall back to non-offload slow path */ __ieee80211_subif_start_xmit(skb, dev, 0, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); return; } if (tid_tx->timeout) tid_tx->last_tx = jiffies; } skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata)); if (!skb) return; info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->hw_queue = sdata->vif.hw_queue[queue]; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); info->flags |= IEEE80211_TX_CTL_HW_80211_ENCAP; info->control.vif = &sdata->vif; if (key) info->control.hw_key = &key->conf; skb_list_walk_safe(skb, seg, next) { skbs++; len += seg->len; if (seg != skb) memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); } if (unlikely(sk_requests_wifi_status(skb->sk))) { info->status_data = ieee80211_store_ack_skb(local, skb, &info->flags, NULL); if (info->status_data) info->status_data_idr = 1; } dev_sw_netstats_tx_add(dev, skbs, len); sta->deflink.tx_stats.packets[queue] += skbs; sta->deflink.tx_stats.bytes[queue] += len; ieee80211_tpt_led_trig_tx(local, len); ieee80211_tx_8023(sdata, skb, sta, false); return; out_free: kfree_skb(skb); } netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) { kfree_skb(skb); return NETDEV_TX_OK; } rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { kfree_skb(skb); goto out; } if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || sdata->control_port_protocol == ehdr->h_proto)) goto skip_offload; key = rcu_dereference(sta->ptk[sta->ptk_idx]); if (!key) key = rcu_dereference(sdata->default_unicast_key); if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) goto skip_offload; sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); ieee80211_8023_xmit(sdata, dev, sta, key, skb); goto out; skip_offload: ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); return NETDEV_TX_OK; } struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags) { struct ieee80211_hdr *hdr; struct ieee80211_tx_data tx = { .local = sdata->local, .sdata = sdata, }; struct sta_info *sta; rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { kfree_skb(skb); skb = ERR_PTR(-EINVAL); goto out; } skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); if (IS_ERR(skb)) goto out; hdr = (void *)skb->data; tx.sta = sta_info_get(sdata, hdr->addr1); tx.skb = skb; if (ieee80211_tx_h_select_key(&tx) != TX_CONTINUE) { rcu_read_unlock(); kfree_skb(skb); return ERR_PTR(-EINVAL); } out: rcu_read_unlock(); return skb; } /* * ieee80211_clear_tx_pending may not be called in a context where * it is possible that it packets could come in again. */ void ieee80211_clear_tx_pending(struct ieee80211_local *local) { struct sk_buff *skb; int i; for (i = 0; i < local->hw.queues; i++) { while ((skb = skb_dequeue(&local->pending[i])) != NULL) ieee80211_free_txskb(&local->hw, skb); } } /* * Returns false if the frame couldn't be transmitted but was queued instead, * which in this case means re-queued -- take as an indication to stop sending * more pending frames. */ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_hdr *hdr; bool result; struct ieee80211_chanctx_conf *chanctx_conf; sdata = vif_to_sdata(info->control.vif); if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) { /* update band only for non-MLD */ if (!ieee80211_vif_is_mld(&sdata->vif)) { chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (unlikely(!chanctx_conf)) { dev_kfree_skb(skb); return true; } info->band = chanctx_conf->def.chan->band; } result = ieee80211_tx(sdata, NULL, skb, true); } else if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { dev_kfree_skb(skb); return true; } if (IS_ERR(sta) || (sta && !sta->uploaded)) sta = NULL; result = ieee80211_tx_8023(sdata, skb, sta, true); } else { struct sk_buff_head skbs; __skb_queue_head_init(&skbs); __skb_queue_tail(&skbs, skb); hdr = (struct ieee80211_hdr *)skb->data; sta = sta_info_get(sdata, hdr->addr1); result = __ieee80211_tx(local, &skbs, sta, true); } return result; } /* * Transmit all pending packets. Called from tasklet. */ void ieee80211_tx_pending(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, tx_pending_tasklet); unsigned long flags; int i; bool txok; rcu_read_lock(); spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < local->hw.queues; i++) { /* * If queue is stopped by something other than due to pending * frames, or we have no pending frames, proceed to next queue. */ if (local->queue_stop_reasons[i] || skb_queue_empty(&local->pending[i])) continue; while (!skb_queue_empty(&local->pending[i])) { struct sk_buff *skb = __skb_dequeue(&local->pending[i]); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); txok = ieee80211_tx_pending_skb(local, skb); spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (!txok) break; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); rcu_read_unlock(); } /* functions for drivers to get certain frames */ static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps, struct sk_buff *skb, bool mcast_traffic) { int i, n1 = 0, n2; /* * Find largest even number N1 so that bits numbered 1 through * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits * (N2 + 1) x 8 through 2007 are 0. */ for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { if (ps->tim[i]) { n1 = i & 0xfe; break; } } n2 = n1; for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { if (ps->tim[i]) { n2 = i; break; } } /* Bitmap control */ skb_put_u8(skb, n1 | mcast_traffic); /* Part Virt Bitmap */ skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); } /* * mac80211 currently supports encoding using block bitmap mode, non * inversed. The current implementation supports up to 1600 AIDs. * * Block bitmap encoding breaks down the AID bitmap into blocks of 64 * AIDs. Each block contains between 0 and 8 subblocks. Each subblock * describes 8 AIDs and the presence of a subblock is determined by * the block bitmap. */ static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps, struct sk_buff *skb, bool mcast_traffic) { int blk; /* * Emit a bitmap control block with a page slice number of 31 and a * page index of 0 which indicates as per IEEE80211-2024 9.4.2.5.1 * that the entire page (2048 bits) indicated by the page index * is encoded in the partial virtual bitmap. */ skb_put_u8(skb, mcast_traffic | (31 << 1)); /* Emit an encoded block for each non-zero sub-block */ for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) { u8 blk_bmap = 0; int sblk; for (sblk = 0; sblk < 8; sblk++) { int sblk_idx = blk * 8 + sblk; /* * If the current subblock is non-zero, increase the * number of subblocks to emit for the current block. */ if (ps->tim[sblk_idx]) blk_bmap |= BIT(sblk); } /* If the current block contains no non-zero sublocks */ if (!blk_bmap) continue; /* * Emit a block control byte for the current encoded block * with an encoding mode of block bitmap (0x0), not inverse * (0x0) and the current block offset (5 bits) */ skb_put_u8(skb, blk << 3); /* * Emit the block bitmap for the current encoded block which * contains the present subblocks. */ skb_put_u8(skb, blk_bmap); /* Emit the present subblocks */ for (sblk = 0; sblk < 8; sblk++) { int sblk_idx = blk * 8 + sblk; if (!(blk_bmap & BIT(sblk))) continue; skb_put_u8(skb, ps->tim[sblk_idx]); } } } static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { struct element *tim; bool mcast_traffic = false, have_bits = false; struct ieee80211_bss_conf *link_conf = link->conf; bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ; /* Generate bitmap for TIM only if there are any STAs in power save * mode. */ if (atomic_read(&ps->num_sta_ps) > 0) /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, IEEE80211_MAX_AID + 1); if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; else ps->dtim_count--; } /* Length is set after parsing the AID bitmap */ tim = skb_put(skb, sizeof(struct element)); tim->id = WLAN_EID_TIM; skb_put_u8(skb, ps->dtim_count); skb_put_u8(skb, link_conf->dtim_period); if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) mcast_traffic = true; ps->dtim_bc_mc = mcast_traffic; if (have_bits) { if (s1g) ieee80211_s1g_beacon_add_tim_pvb(ps, skb, mcast_traffic); else ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic); } else { /* * If there is no buffered unicast traffic for an S1G * interface, we can exclude the bitmap control. This is in * contrast to other phy types as they do include the bitmap * control and pvb even when there is no buffered traffic. */ if (!s1g) { /* Bitmap control */ skb_put_u8(skb, mcast_traffic); /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } tim->datalen = skb_tail_pointer(skb) - tim->data; } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { struct ieee80211_local *local = sdata->local; /* * Not very nice, but we want to allow the driver to call * ieee80211_beacon_get() as a response to the set_tim() * callback. That, however, is already invoked under the * sta_lock to guarantee consistent and race-free update * of the tim bitmap in mac80211 and the driver. */ if (local->tim_in_locked_section) { __ieee80211_beacon_add_tim(sdata, link, ps, skb, is_template); } else { spin_lock_bh(&local->tim_lock); __ieee80211_beacon_add_tim(sdata, link, ps, skb, is_template); spin_unlock_bh(&local->tim_lock); } return 0; } static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon, struct ieee80211_link_data *link) { u8 *beacon_data, count, max_count = 1; struct probe_resp *resp; size_t beacon_data_len; u16 *bcn_offsets; int i; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: beacon_data = beacon->tail; beacon_data_len = beacon->tail_len; break; case NL80211_IFTYPE_ADHOC: beacon_data = beacon->head; beacon_data_len = beacon->head_len; break; case NL80211_IFTYPE_MESH_POINT: beacon_data = beacon->head; beacon_data_len = beacon->head_len; break; default: return; } resp = rcu_dereference(link->u.ap.probe_resp); bcn_offsets = beacon->cntdwn_counter_offsets; count = beacon->cntdwn_current_counter; if (link->conf->csa_active) max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM; for (i = 0; i < max_count; ++i) { if (bcn_offsets[i]) { if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len)) return; beacon_data[bcn_offsets[i]] = count; } if (sdata->vif.type == NL80211_IFTYPE_AP && resp) { u16 *resp_offsets = resp->cntdwn_counter_offsets; resp->data[resp_offsets[i]] = count; } } } static u8 __ieee80211_beacon_update_cntdwn(struct ieee80211_link_data *link, struct beacon_data *beacon) { if (beacon->cntdwn_current_counter == 1) { /* * Channel switch handling is done by a worker thread while * beacons get pulled from hardware timers. It's therefore * possible that software threads are slow enough to not be * able to complete CSA handling in a single beacon interval, * in which case we get here. There isn't much to do about * it, other than letting the user know that the AP isn't * behaving correctly. */ link_err_once(link, "beacon TX faster than countdown (channel/color switch) completion\n"); return 0; } beacon->cntdwn_current_counter--; return beacon->cntdwn_current_counter; } u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; struct beacon_data *beacon = NULL; u8 count = 0; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return 0; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) goto unlock; if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(link->u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (!beacon) goto unlock; count = __ieee80211_beacon_update_cntdwn(link, beacon); unlock: rcu_read_unlock(); return count; } EXPORT_SYMBOL(ieee80211_beacon_update_cntdwn); void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->deflink.u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (!beacon) goto unlock; if (counter < beacon->cntdwn_current_counter) beacon->cntdwn_current_counter = counter; unlock: rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn); bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; struct beacon_data *beacon = NULL; u8 *beacon_data; size_t beacon_data_len; int ret = false; if (!ieee80211_sdata_running(sdata)) return false; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return 0; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) goto out; if (vif->type == NL80211_IFTYPE_AP) { beacon = rcu_dereference(link->u.ap.beacon); if (WARN_ON(!beacon || !beacon->tail)) goto out; beacon_data = beacon->tail; beacon_data_len = beacon->tail_len; } else if (vif->type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; beacon = rcu_dereference(ifibss->presp); if (!beacon) goto out; beacon_data = beacon->head; beacon_data_len = beacon->head_len; } else if (vif->type == NL80211_IFTYPE_MESH_POINT) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; beacon = rcu_dereference(ifmsh->beacon); if (!beacon) goto out; beacon_data = beacon->head; beacon_data_len = beacon->head_len; } else { WARN_ON(1); goto out; } if (!beacon->cntdwn_counter_offsets[0]) goto out; if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[0] > beacon_data_len)) goto out; if (beacon_data[beacon->cntdwn_counter_offsets[0]] == 1) ret = true; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete); static int ieee80211_beacon_protect(struct sk_buff *skb, struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { ieee80211_tx_result res; struct ieee80211_tx_data tx; struct sk_buff *check_skb; memset(&tx, 0, sizeof(tx)); tx.key = rcu_dereference(link->default_beacon_key); if (!tx.key) return 0; if (unlikely(tx.key->flags & KEY_FLAG_TAINTED)) { tx.key = NULL; return -EINVAL; } if (!(tx.key->conf.flags & IEEE80211_KEY_FLAG_SW_MGMT_TX) && tx.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) IEEE80211_SKB_CB(skb)->control.hw_key = &tx.key->conf; tx.local = local; tx.sdata = sdata; __skb_queue_head_init(&tx.skbs); __skb_queue_tail(&tx.skbs, skb); res = ieee80211_tx_h_encrypt(&tx); check_skb = __skb_dequeue(&tx.skbs); /* we may crash after this, but it'd be a bug in crypto */ WARN_ON(check_skb != skb); if (WARN_ON_ONCE(res != TX_CONTINUE)) return -EINVAL; return 0; } static void ieee80211_beacon_get_finish(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_mutable_offsets *offs, struct beacon_data *beacon, struct sk_buff *skb, struct ieee80211_chanctx_conf *chanctx_conf, u16 csa_off_base) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info; enum nl80211_band band; struct ieee80211_tx_rate_control txrc; /* CSA offsets */ if (offs && beacon) { u16 i; for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { u16 csa_off = beacon->cntdwn_counter_offsets[i]; if (!csa_off) continue; offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; } } band = chanctx_conf->def.chan->band; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; info->flags |= IEEE80211_TX_CTL_NO_ACK; info->band = band; memset(&txrc, 0, sizeof(txrc)); txrc.hw = hw; txrc.sband = local->hw.wiphy->bands[band]; txrc.bss_conf = link->conf; txrc.skb = skb; txrc.reported_rate.idx = -1; if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; else txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); info->control.vif = vif; info->control.flags |= u32_encode_bits(link->link_id, IEEE80211_TX_CTRL_MLO_LINK); info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_ASSIGN_SEQ | IEEE80211_TX_CTL_FIRST_FRAGMENT; } static void ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon, u8 i) { if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt || i > beacon->mbssid_ies->cnt) return; if (i < beacon->mbssid_ies->cnt) { skb_put_data(skb, beacon->mbssid_ies->elem[i].data, beacon->mbssid_ies->elem[i].len); if (beacon->rnr_ies && beacon->rnr_ies->cnt) { skb_put_data(skb, beacon->rnr_ies->elem[i].data, beacon->rnr_ies->elem[i].len); for (i = beacon->mbssid_ies->cnt; i < beacon->rnr_ies->cnt; i++) skb_put_data(skb, beacon->rnr_ies->elem[i].data, beacon->rnr_ies->elem[i].len); } return; } /* i == beacon->mbssid_ies->cnt, include all MBSSID elements */ for (i = 0; i < beacon->mbssid_ies->cnt; i++) skb_put_data(skb, beacon->mbssid_ies->elem[i].data, beacon->mbssid_ies->elem[i].len); } static struct sk_buff * __ieee80211_beacon_get_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_mutable_offsets *offs, bool is_template, struct beacon_data *beacon, struct ieee80211_chanctx_conf *chanctx_conf, u8 ema_index) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_ap *ap = &sdata->u.ap; struct sk_buff *skb = NULL; u16 csa_off_base = 0; int mbssid_len; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) ieee80211_beacon_update_cntdwn(vif, link->link_id); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } /* headroom, head length, * tail length, maximum TIM length and multiple BSSID length */ mbssid_len = ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies, beacon->rnr_ies, ema_index); skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + beacon->tail_len + 256 + local->hw.extra_beacon_tailroom + mbssid_len); if (!skb) return NULL; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); ieee80211_beacon_add_tim(sdata, link, &ap->ps, skb, is_template); if (offs) { offs->tim_offset = beacon->head_len; offs->tim_length = skb->len - beacon->head_len; offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0]; if (mbssid_len) { ieee80211_beacon_add_mbssid(skb, beacon, ema_index); offs->mbssid_off = skb->len - mbssid_len; } /* for AP the csa offsets are from tail */ csa_off_base = skb->len; } if (beacon->tail) skb_put_data(skb, beacon->tail, beacon->tail_len); if (ieee80211_beacon_protect(skb, local, sdata, link) < 0) { dev_kfree_skb(skb); return NULL; } ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, csa_off_base); return skb; } static bool ieee80211_s1g_need_long_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ps_data *ps = &sdata->u.ap.ps; if (ps->sb_count == 0) ps->sb_count = link->conf->s1g_long_beacon_period - 1; else ps->sb_count--; return ps->sb_count == 0; } static struct sk_buff * ieee80211_s1g_short_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_chanctx_conf *chanctx_conf, struct s1g_short_beacon_data *sb, bool is_template) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_ap *ap = &sdata->u.ap; struct sk_buff *skb; skb = dev_alloc_skb(local->tx_headroom + sb->short_head_len + sb->short_tail_len + 256 + local->hw.extra_beacon_tailroom); if (!skb) return NULL; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, sb->short_head, sb->short_head_len); ieee80211_beacon_add_tim(sdata, link, &ap->ps, skb, is_template); if (sb->short_tail) skb_put_data(skb, sb->short_tail, sb->short_tail_len); ieee80211_beacon_get_finish(hw, vif, link, NULL, NULL, skb, chanctx_conf, 0); return skb; } static struct sk_buff * ieee80211_beacon_get_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_mutable_offsets *offs, bool is_template, struct beacon_data *beacon, struct ieee80211_chanctx_conf *chanctx_conf, u8 ema_index, struct s1g_short_beacon_data *s1g_sb) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (!sdata->vif.cfg.s1g || !s1g_sb || ieee80211_s1g_need_long_beacon(sdata, link)) return __ieee80211_beacon_get_ap(hw, vif, link, offs, is_template, beacon, chanctx_conf, ema_index); return ieee80211_s1g_short_beacon_get(hw, vif, link, chanctx_conf, s1g_sb, is_template); } static struct ieee80211_ema_beacons * ieee80211_beacon_get_ap_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_data *link, struct ieee80211_mutable_offsets *offs, bool is_template, struct beacon_data *beacon, struct ieee80211_chanctx_conf *chanctx_conf) { struct ieee80211_ema_beacons *ema = NULL; if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt) return NULL; ema = kzalloc(struct_size(ema, bcn, beacon->mbssid_ies->cnt), GFP_ATOMIC); if (!ema) return NULL; for (ema->cnt = 0; ema->cnt < beacon->mbssid_ies->cnt; ema->cnt++) { ema->bcn[ema->cnt].skb = ieee80211_beacon_get_ap(hw, vif, link, &ema->bcn[ema->cnt].offs, is_template, beacon, chanctx_conf, ema->cnt, NULL); if (!ema->bcn[ema->cnt].skb) break; } if (ema->cnt == beacon->mbssid_ies->cnt) return ema; ieee80211_beacon_free_ema_list(ema); return NULL; } #define IEEE80211_INCLUDE_ALL_MBSSID_ELEMS -1 static struct sk_buff * __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, bool is_template, unsigned int link_id, int ema_index, struct ieee80211_ema_beacons **ema_beacons) { struct ieee80211_local *local = hw_to_local(hw); struct beacon_data *beacon = NULL; struct sk_buff *skb = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_link_data *link; struct s1g_short_beacon_data *s1g_short_bcn = NULL; rcu_read_lock(); sdata = vif_to_sdata(vif); link = rcu_dereference(sdata->link[link_id]); if (!link) goto out; chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (!ieee80211_sdata_running(sdata) || !chanctx_conf) goto out; if (offs) memset(offs, 0, sizeof(*offs)); if (sdata->vif.type == NL80211_IFTYPE_AP) { beacon = rcu_dereference(link->u.ap.beacon); if (!beacon) goto out; if (vif->cfg.s1g && link->u.ap.s1g_short_beacon) { s1g_short_bcn = rcu_dereference(link->u.ap.s1g_short_beacon); if (!s1g_short_bcn) goto out; } if (ema_beacons) { *ema_beacons = ieee80211_beacon_get_ap_ema_list(hw, vif, link, offs, is_template, beacon, chanctx_conf); } else { if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { if (ema_index >= beacon->mbssid_ies->cnt) goto out; /* End of MBSSID elements */ if (ema_index <= IEEE80211_INCLUDE_ALL_MBSSID_ELEMS) ema_index = beacon->mbssid_ies->cnt; } else { ema_index = 0; } skb = ieee80211_beacon_get_ap(hw, vif, link, offs, is_template, beacon, chanctx_conf, ema_index, s1g_short_bcn); } } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; beacon = rcu_dereference(ifibss->presp); if (!beacon) goto out; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) __ieee80211_beacon_update_cntdwn(link, beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, 0); } else if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; beacon = rcu_dereference(ifmsh->beacon); if (!beacon) goto out; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) /* TODO: For mesh csa_counter is in TU, so * decrementing it by one isn't correct, but * for now we leave it consistent with overall * mac80211's behavior. */ __ieee80211_beacon_update_cntdwn(link, beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } if (ifmsh->sync_ops) ifmsh->sync_ops->adjust_tsf(sdata, beacon); skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + 256 + /* TIM IE */ beacon->tail_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); ieee80211_beacon_add_tim(sdata, link, &ifmsh->ps, skb, is_template); if (offs) { offs->tim_offset = beacon->head_len; offs->tim_length = skb->len - beacon->head_len; } skb_put_data(skb, beacon->tail, beacon->tail_len); ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, 0); } else { WARN_ON(1); goto out; } out: rcu_read_unlock(); return skb; } struct sk_buff * ieee80211_beacon_get_template(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id) { return __ieee80211_beacon_get(hw, vif, offs, true, link_id, IEEE80211_INCLUDE_ALL_MBSSID_ELEMS, NULL); } EXPORT_SYMBOL(ieee80211_beacon_get_template); struct sk_buff * ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id, u8 ema_index) { return __ieee80211_beacon_get(hw, vif, offs, true, link_id, ema_index, NULL); } EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_index); void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons) { u8 i; if (!ema_beacons) return; for (i = 0; i < ema_beacons->cnt; i++) kfree_skb(ema_beacons->bcn[i].skb); kfree(ema_beacons); } EXPORT_SYMBOL(ieee80211_beacon_free_ema_list); struct ieee80211_ema_beacons * ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_ema_beacons *ema_beacons = NULL; WARN_ON(__ieee80211_beacon_get(hw, vif, NULL, true, link_id, 0, &ema_beacons)); return ema_beacons; } EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_list); struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length, unsigned int link_id) { struct ieee80211_mutable_offsets offs = {}; struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false, link_id, IEEE80211_INCLUDE_ALL_MBSSID_ELEMS, NULL); struct sk_buff *copy; if (!bcn) return bcn; if (tim_offset) *tim_offset = offs.tim_offset; if (tim_length) *tim_length = offs.tim_length; if (ieee80211_hw_check(hw, BEACON_TX_STATUS) || !hw_to_local(hw)->monitors) return bcn; /* send a copy to monitor interfaces */ copy = skb_copy(bcn, GFP_ATOMIC); if (!copy) return bcn; ieee80211_tx_monitor(hw_to_local(hw), copy, 1, NULL); return bcn; } EXPORT_SYMBOL(ieee80211_beacon_get_tim); struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct probe_resp *presp = NULL; struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); presp = rcu_dereference(sdata->deflink.u.ap.probe_resp); if (!presp) goto out; skb = dev_alloc_skb(presp->len); if (!skb) goto out; skb_put_data(skb, presp->data, presp->len); hdr = (struct ieee80211_hdr *) skb->data; memset(hdr->addr1, 0, sizeof(hdr->addr1)); out: rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_proberesp_get); struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct fils_discovery_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); tmpl = rcu_dereference(sdata->deflink.u.ap.fils_discovery); if (!tmpl) { rcu_read_unlock(); return NULL; } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { skb_reserve(skb, sdata->local->hw.extra_tx_headroom); skb_put_data(skb, tmpl->data, tmpl->len); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl); struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct unsol_bcast_probe_resp_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); tmpl = rcu_dereference(sdata->deflink.u.ap.unsol_bcast_probe_resp); if (!tmpl) { rcu_read_unlock(); return NULL; } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { skb_reserve(skb, sdata->local->hw.extra_tx_headroom); skb_put_data(skb, tmpl->data, tmpl->len); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_unsol_bcast_probe_resp_tmpl); struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata; struct ieee80211_pspoll *pspoll; struct ieee80211_local *local; struct sk_buff *skb; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; sdata = vif_to_sdata(vif); local = sdata->local; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll)); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); pspoll = skb_put_zero(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL); pspoll->aid = cpu_to_le16(sdata->vif.cfg.aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); memcpy(pspoll->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(pspoll->ta, vif->addr, ETH_ALEN); return skb; } EXPORT_SYMBOL(ieee80211_pspoll_get); struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int link_id, bool qos_ok) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_link_data *link = NULL; struct ieee80211_hdr_3addr *nullfunc; struct sk_buff *skb; bool qos = false; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc) + 2); if (!skb) return NULL; rcu_read_lock(); if (qos_ok) { struct sta_info *sta; sta = sta_info_get(sdata, vif->cfg.ap_addr); qos = sta && sta->sta.wme; } if (link_id >= 0) { link = rcu_dereference(sdata->link[link_id]); if (WARN_ON_ONCE(!link)) { rcu_read_unlock(); kfree_skb(skb); return NULL; } } skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put_zero(skb, sizeof(*nullfunc)); nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS); if (qos) { __le16 qoshdr = cpu_to_le16(7); BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_STYPE_NULLFUNC) != IEEE80211_STYPE_QOS_NULLFUNC); nullfunc->frame_control |= cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC); skb->priority = 7; skb_set_queue_mapping(skb, IEEE80211_AC_VO); skb_put_data(skb, &qoshdr, sizeof(qoshdr)); } if (link) { memcpy(nullfunc->addr1, link->conf->bssid, ETH_ALEN); memcpy(nullfunc->addr2, link->conf->addr, ETH_ALEN); memcpy(nullfunc->addr3, link->conf->bssid, ETH_ALEN); } else { memcpy(nullfunc->addr1, vif->cfg.ap_addr, ETH_ALEN); memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); memcpy(nullfunc->addr3, vif->cfg.ap_addr, ETH_ALEN); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_nullfunc_get); struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw, const u8 *src_addr, const u8 *ssid, size_t ssid_len, size_t tailroom) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_hdr_3addr *hdr; struct sk_buff *skb; size_t ie_ssid_len; u8 *pos; ie_ssid_len = 2 + ssid_len; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*hdr) + ie_ssid_len + tailroom); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); hdr = skb_put_zero(skb, sizeof(*hdr)); hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ); eth_broadcast_addr(hdr->addr1); memcpy(hdr->addr2, src_addr, ETH_ALEN); eth_broadcast_addr(hdr->addr3); pos = skb_put(skb, ie_ssid_len); *pos++ = WLAN_EID_SSID; *pos++ = ssid_len; if (ssid_len) memcpy(pos, ssid, ssid_len); pos += ssid_len; return skb; } EXPORT_SYMBOL(ieee80211_probereq_get); void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_rts *rts) { const struct ieee80211_hdr *hdr = frame; rts->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS); rts->duration = ieee80211_rts_duration(hw, vif, frame_len, frame_txctl); memcpy(rts->ra, hdr->addr1, sizeof(rts->ra)); memcpy(rts->ta, hdr->addr2, sizeof(rts->ta)); } EXPORT_SYMBOL(ieee80211_rts_get); void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_cts *cts) { const struct ieee80211_hdr *hdr = frame; cts->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS); cts->duration = ieee80211_ctstoself_duration(hw, vif, frame_len, frame_txctl); memcpy(cts->ra, hdr->addr1, sizeof(cts->ra)); } EXPORT_SYMBOL(ieee80211_ctstoself_get); struct sk_buff * ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_local *local = hw_to_local(hw); struct sk_buff *skb = NULL; struct ieee80211_tx_data tx; struct ieee80211_sub_if_data *sdata; struct ps_data *ps; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; sdata = vif_to_sdata(vif); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) { struct beacon_data *beacon = rcu_dereference(sdata->deflink.u.ap.beacon); if (!beacon || !beacon->head) goto out; ps = &sdata->u.ap.ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { goto out; } if (ps->dtim_count != 0 || !ps->dtim_bc_mc) goto out; /* send buffered bc/mc only after DTIM beacon */ while (1) { skb = skb_dequeue(&ps->bc_buf); if (!skb) goto out; local->total_ps_buffered--; if (!skb_queue_empty(&ps->bc_buf) && skb->len >= 2) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; /* more buffered multicast/broadcast frames ==> set * MoreData flag in IEEE 802.11 header to inform PS * STAs */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } if (sdata->vif.type == NL80211_IFTYPE_AP) sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); if (!ieee80211_tx_prepare(sdata, &tx, NULL, skb)) break; ieee80211_free_txskb(hw, skb); } info = IEEE80211_SKB_CB(skb); tx.flags |= IEEE80211_TX_PS_BUFFERED; info->band = chanctx_conf->def.chan->band; if (invoke_tx_handlers(&tx)) skb = NULL; out: rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_buffered_bc); int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ret; u32 queues; lockdep_assert_wiphy(local->hw.wiphy); /* only some cases are supported right now */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: break; default: WARN_ON(1); return -EINVAL; } if (WARN_ON(tid >= IEEE80211_NUM_UPS)) return -EINVAL; if (sta->reserved_tid == tid) { ret = 0; goto out; } if (sta->reserved_tid != IEEE80211_TID_UNRESERVED) { sdata_err(sdata, "TID reservation already active\n"); ret = -EALREADY; goto out; } ieee80211_stop_vif_queues(sdata->local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); synchronize_net(); /* Tear down BA sessions so we stop aggregating on this TID */ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); } queues = BIT(sdata->vif.hw_queue[ieee802_1d_to_ac[tid]]); __ieee80211_flush_queues(local, sdata, queues, false); sta->reserved_tid = tid; ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ret = 0; out: return ret; } EXPORT_SYMBOL(ieee80211_reserve_tid); void ieee80211_unreserve_tid(struct ieee80211_sta *pubsta, u8 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* only some cases are supported right now */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: break; default: WARN_ON(1); return; } if (tid != sta->reserved_tid) { sdata_err(sdata, "TID to unreserve (%d) isn't reserved\n", tid); return; } sta->reserved_tid = IEEE80211_TID_UNRESERVED; } EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id, enum nl80211_band band) { const struct ieee80211_hdr *hdr = (void *)skb->data; int ac = ieee80211_ac_from_tid(tid); unsigned int link; skb_reset_mac_header(skb); skb_set_queue_mapping(skb, ac); skb->priority = tid; skb->dev = sdata->dev; BUILD_BUG_ON(IEEE80211_LINK_UNSPECIFIED < IEEE80211_MLD_MAX_NUM_LINKS); BUILD_BUG_ON(!FIELD_FIT(IEEE80211_TX_CTRL_MLO_LINK, IEEE80211_LINK_UNSPECIFIED)); if (!ieee80211_vif_is_mld(&sdata->vif)) { link = 0; } else if (link_id >= 0) { link = link_id; } else if (memcmp(sdata->vif.addr, hdr->addr2, ETH_ALEN) == 0) { /* address from the MLD */ link = IEEE80211_LINK_UNSPECIFIED; } else { /* otherwise must be addressed from a link */ rcu_read_lock(); for (link = 0; link < ARRAY_SIZE(sdata->vif.link_conf); link++) { struct ieee80211_bss_conf *link_conf; link_conf = rcu_dereference(sdata->vif.link_conf[link]); if (!link_conf) continue; if (memcmp(link_conf->addr, hdr->addr2, ETH_ALEN) == 0) break; } rcu_read_unlock(); if (WARN_ON_ONCE(link == ARRAY_SIZE(sdata->vif.link_conf))) link = ffs(sdata->vif.active_links) - 1; } IEEE80211_SKB_CB(skb)->control.flags |= u32_encode_bits(link, IEEE80211_TX_CTRL_MLO_LINK); /* * The other path calling ieee80211_xmit is from the tasklet, * and while we can handle concurrent transmissions locking * requirements are that we do not come into tx with bhs on. */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id) { struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_NAN) { band = NUM_NL80211_BANDS; } else if (!ieee80211_vif_is_mld(&sdata->vif)) { WARN_ON(link_id >= 0); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } band = chanctx_conf->def.chan->band; } else { WARN_ON(link_id >= 0 && !(sdata->vif.active_links & BIT(link_id))); /* MLD transmissions must not rely on the band */ band = 0; } __ieee80211_tx_skb_tid_band(sdata, skb, tid, link_id, band); rcu_read_unlock(); } int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, int link_id, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; u32 flags = 0; int err; /* mutex lock is only needed for incrementing the cookie counter */ lockdep_assert_wiphy(local->hw.wiphy); /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE * or Pre-Authentication */ if (proto != sdata->control_port_protocol && proto != cpu_to_be16(ETH_P_PREAUTH)) return -EINVAL; if (proto == sdata->control_port_protocol) ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO | IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP; if (unencrypted) flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (cookie) ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom + sizeof(struct ethhdr)); skb_put_data(skb, buf, len); ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr->h_dest, dest, ETH_ALEN); /* we may override the SA for MLO STA later */ if (link_id < 0) { ctrl_flags |= u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, IEEE80211_TX_CTRL_MLO_LINK); memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_bss_conf *link_conf; ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK); rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (!link_conf) { dev_kfree_skb(skb); rcu_read_unlock(); return -ENOLINK; } memcpy(ehdr->h_source, link_conf->addr, ETH_ALEN); rcu_read_unlock(); } ehdr->h_proto = proto; skb->dev = dev; skb->protocol = proto; skb_reset_network_header(skb); skb_reset_mac_header(skb); if (local->hw.queues < IEEE80211_NUM_ACS) goto start_xmit; /* update QoS header to prioritize control port frames if possible, * prioritization also happens for control port frames send over * AF_PACKET */ rcu_read_lock(); err = ieee80211_lookup_ra_sta(sdata, skb, &sta); if (err) { dev_kfree_skb(skb); rcu_read_unlock(); return err; } if (!IS_ERR(sta)) { u16 queue = ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); /* * for MLO STA, the SA should be the AP MLD address, but * the link ID has been selected already */ if (sta && sta->sta.mlo) memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } rcu_read_unlock(); start_xmit: local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags, cookie); local_bh_enable(); return 0; } int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; skb = dev_alloc_skb(local->hw.extra_tx_headroom + len + 30 + /* header size */ 18); /* 11s header size */ if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); skb_put_data(skb, buf, len); skb->dev = dev; skb->protocol = htons(ETH_P_802_3); skb_reset_network_header(skb); skb_reset_mac_header(skb); local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, 0, IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP, NULL); local_bh_enable(); return 0; }
1 1 1 1 1 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 // SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address). * * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, hash_rnd); } static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. */ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. */ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. */ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. */ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). * This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. */ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. */ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason; const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev)); if (drop_reason != SKB_NOT_DROPPED_YET) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) { drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; goto freeskb; } memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb_reason(skb, drop_reason); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl_net(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). */ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. */ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_net_lock(net); err = arp_req_delete(net, &r); rtnl_net_unlock(net); break; case SIOCSARP: rtnl_net_lock(net); err = arp_req_set(net, &r); rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. */ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); }
4 4 3 6 6 1 30 21 6 26 2 1 1 1 1 1 1 1 1 4 1 1 1 1 17 16 1 37 38 1 2 6 1 19 20 7 19 4 3 4 4 2 1 1 2 2 1 1 2 2 1 1 2 2 2 36 1 4 2 2 31 1 1 28 5 1 23 24 5 14 5 3 2 2 17 17 17 17 17 1 16 18 18 18 36 42 42 4 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/hpfs/super.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * mounting, unmounting, error handling */ #include "hpfs_fn.h" #include <linux/module.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/init.h> #include <linux/statfs.h> #include <linux/magic.h> #include <linux/sched.h> #include <linux/bitmap.h> #include <linux/slab.h> #include <linux/seq_file.h> /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ static void mark_dirty(struct super_block *s, int remount) { if (hpfs_sb(s)->sb_chkdsk && (remount || !sb_rdonly(s))) { struct buffer_head *bh; struct hpfs_spare_block *sb; if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = 1; sb->old_wrote = 0; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } } } /* Mark the filesystem clean (mark it dirty for chkdsk if chkdsk==2 or if there were errors) */ static void unmark_dirty(struct super_block *s) { struct buffer_head *bh; struct hpfs_spare_block *sb; if (sb_rdonly(s)) return; sync_blockdev(s->s_bdev); if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } } /* Filesystem error... */ void hpfs_error(struct super_block *s, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("filesystem error: %pV", &vaf); va_end(args); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { pr_cont("; crashing the system because you wanted it\n"); mark_dirty(s, 0); panic("HPFS panic"); } else if (hpfs_sb(s)->sb_err == 1) { if (sb_rdonly(s)) pr_cont("; already mounted read-only\n"); else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); s->s_flags |= SB_RDONLY; } } else if (sb_rdonly(s)) pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); else pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); } else pr_cont("\n"); hpfs_sb(s)->sb_was_error = 1; } /* * A little trick to detect cycles in many hpfs structures and don't let the * kernel crash on corrupted filesystem. When first called, set c2 to 0. * * BTW. chkdsk doesn't detect cycles correctly. When I had 2 lost directories * nested each in other, chkdsk locked up happilly. */ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, char *msg) { if (*c2 && *c1 == key) { hpfs_error(s, "cycle detected on key %08x in %s", key, msg); return 1; } (*c2)++; if (!((*c2 - 1) & *c2)) *c1 = key; return 0; } static void free_sbi(struct hpfs_sb_info *sbi) { kfree(sbi->sb_cp_table); kfree(sbi->sb_bmp_dir); kfree(sbi); } static void lazy_free_sbi(struct rcu_head *rcu) { free_sbi(container_of(rcu, struct hpfs_sb_info, rcu)); } static void hpfs_put_super(struct super_block *s) { hpfs_lock(s); unmark_dirty(s); hpfs_unlock(s); call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); } static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) { struct quad_buffer_head qbh; unsigned long *bits; unsigned count; bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) return (unsigned)-1; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); hpfs_brelse4(&qbh); return count; } static unsigned count_bitmaps(struct super_block *s) { unsigned n, count, n_bands; n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; count = 0; for (n = 0; n < COUNT_RD_AHEAD; n++) { hpfs_prefetch_bitmap(s, n); } for (n = 0; n < n_bands; n++) { unsigned c; hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); if (c != (unsigned)-1) count += c; } return count; } unsigned hpfs_get_free_dnodes(struct super_block *s) { struct hpfs_sb_info *sbi = hpfs_sb(s); if (sbi->sb_n_free_dnodes == (unsigned)-1) { unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap); if (c == (unsigned)-1) return 0; sbi->sb_n_free_dnodes = c; } return sbi->sb_n_free_dnodes; } static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); hpfs_lock(s); if (sbi->sb_n_free == (unsigned)-1) sbi->sb_n_free = count_bitmaps(s); buf->f_type = s->s_magic; buf->f_bsize = 512; buf->f_blocks = sbi->sb_fs_size; buf->f_bfree = sbi->sb_n_free; buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; buf->f_ffree = hpfs_get_free_dnodes(s); buf->f_fsid = u64_to_fsid(id); buf->f_namelen = 254; hpfs_unlock(s); return 0; } long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg) { switch (cmd) { case FITRIM: { struct fstrim_range range; secno n_trimmed; int r; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed); if (r) return r; range.len = (u64)n_trimmed << 9; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } default: { return -ENOIOCTLCMD; } } } static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; } static void hpfs_free_inode(struct inode *inode) { kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(hpfs_inode_cachep); } enum { Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case, Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift, }; static const struct constant_table hpfs_param_case[] = { {"asis", 0}, {"lower", 1}, {} }; static const struct constant_table hpfs_param_check[] = { {"none", 0}, {"normal", 1}, {"strict", 2}, {} }; static const struct constant_table hpfs_param_err[] = { {"continue", 0}, {"remount-ro", 1}, {"panic", 2}, {} }; static const struct constant_table hpfs_param_eas[] = { {"no", 0}, {"ro", 1}, {"rw", 2}, {} }; static const struct constant_table hpfs_param_chkdsk[] = { {"no", 0}, {"errors", 1}, {"always", 2}, {} }; static const struct fs_parameter_spec hpfs_param_spec[] = { fsparam_flag ("help", Opt_help), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_enum ("case", Opt_case, hpfs_param_case), fsparam_enum ("check", Opt_check, hpfs_param_check), fsparam_enum ("errors", Opt_err, hpfs_param_err), fsparam_enum ("eas", Opt_eas, hpfs_param_eas), fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk), fsparam_s32 ("timeshift", Opt_timeshift), {} }; struct hpfs_fc_context { kuid_t uid; kgid_t gid; umode_t umask; int lowercase; int eas; int chk; int errs; int chkdsk; int timeshift; }; static inline void hpfs_help(void) { pr_info("\n\ HPFS filesystem options:\n\ help do not mount and display this text\n\ uid=xxx set uid of files that don't have uid specified in eas\n\ gid=xxx set gid of files that don't have gid specified in eas\n\ umask=xxx set mode of files that don't have mode specified in eas\n\ case=lower lowercase all files\n\ case=asis do not lowercase files (default)\n\ check=none no fs checks - kernel may crash on corrupted filesystem\n\ check=normal do some checks - it should not crash (default)\n\ check=strict do extra time-consuming checks, used for debugging\n\ errors=continue continue on errors\n\ errors=remount-ro remount read-only if errors found (default)\n\ errors=panic panic on errors\n\ chkdsk=no do not mark fs for chkdsking even if there were errors\n\ chkdsk=errors mark fs dirty if errors found (default)\n\ chkdsk=always always mark fs dirty - used for debugging\n\ eas=no ignore extended attributes\n\ eas=ro read but do not write extended attributes\n\ eas=rw r/w eas => enables chmod, chown, mknod, ln -s (default)\n\ timeshift=nnn add nnn seconds to file times\n\ \n"); } static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hpfs_fc_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, hpfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_help: hpfs_help(); return -EINVAL; case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_umask: ctx->umask = result.uint_32; break; case Opt_case: ctx->lowercase = result.uint_32; break; case Opt_check: ctx->chk = result.uint_32; break; case Opt_err: ctx->errs = result.uint_32; break; case Opt_eas: ctx->eas = result.uint_32; break; case Opt_chkdsk: ctx->chkdsk = result.uint_32; break; case Opt_timeshift: { char *rhs = param->string; int timeshift; if (kstrtoint(rhs, 0, &timeshift)) return -EINVAL; ctx->timeshift = timeshift; break; } default: return -EINVAL; } return 0; } static int hpfs_reconfigure(struct fs_context *fc) { struct hpfs_fc_context *ctx = fc->fs_private; struct super_block *s = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); sync_filesystem(s); fc->sb_flags |= SB_NOATIME; hpfs_lock(s); if (ctx->timeshift != sbi->sb_timeshift) { pr_err("timeshift can't be changed using remount.\n"); goto out_err; } unmark_dirty(s); sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_lowercase = ctx->lowercase; sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; sbi->sb_chkdsk = ctx->chkdsk; sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; out_err: hpfs_unlock(s); return -EINVAL; } static int hpfs_show_options(struct seq_file *seq, struct dentry *root) { struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb); seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid)); seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid)); seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777)); if (sbi->sb_lowercase) seq_printf(seq, ",case=lower"); if (!sbi->sb_chk) seq_printf(seq, ",check=none"); if (sbi->sb_chk == 2) seq_printf(seq, ",check=strict"); if (!sbi->sb_err) seq_printf(seq, ",errors=continue"); if (sbi->sb_err == 2) seq_printf(seq, ",errors=panic"); if (!sbi->sb_chkdsk) seq_printf(seq, ",chkdsk=no"); if (sbi->sb_chkdsk == 2) seq_printf(seq, ",chkdsk=always"); if (!sbi->sb_eas) seq_printf(seq, ",eas=no"); if (sbi->sb_eas == 1) seq_printf(seq, ",eas=ro"); if (sbi->sb_timeshift) seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift); return 0; } /* Super operations */ static const struct super_operations hpfs_sops = { .alloc_inode = hpfs_alloc_inode, .free_inode = hpfs_free_inode, .evict_inode = hpfs_evict_inode, .put_super = hpfs_put_super, .statfs = hpfs_statfs, .show_options = hpfs_show_options, }; static int hpfs_fill_super(struct super_block *s, struct fs_context *fc) { struct hpfs_fc_context *ctx = fc->fs_private; struct buffer_head *bh0, *bh1, *bh2; struct hpfs_boot_block *bootblock; struct hpfs_super_block *superblock; struct hpfs_spare_block *spareblock; struct hpfs_sb_info *sbi; struct inode *root; int silent = fc->sb_flags & SB_SILENT; dnode_secno root_dno; struct hpfs_dirent *de = NULL; struct quad_buffer_head qbh; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; } s->s_fs_info = sbi; mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); /*sbi->sb_mounting = 1;*/ sb_set_blocksize(s, 512); sbi->sb_fs_size = -1; if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; /* Check magics */ if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC || le32_to_cpu(spareblock->magic) != SP_MAGIC) { if (!silent) pr_err("Bad magic ... probably not HPFS\n"); goto bail4; } /* Check version */ if (!sb_rdonly(s) && superblock->funcversion != 2 && superblock->funcversion != 3) { pr_err("Bad version %d,%d. Mount readonly to go around\n", (int)superblock->version, (int)superblock->funcversion); pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); goto bail4; } s->s_flags |= SB_NOATIME; /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; set_default_d_op(s, &hpfs_dentry_operations); s->s_time_min = local_to_gmt(s, 0); s->s_time_max = local_to_gmt(s, U32_MAX); sbi->sb_root = le32_to_cpu(superblock->root); sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps); sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_n_free = -1; sbi->sb_n_free_dnodes = -1; sbi->sb_lowercase = ctx->lowercase; sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; sbi->sb_chkdsk = ctx->chkdsk; sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; sbi->sb_was_error = 0; sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; sbi->sb_max_fwd_alloc = 0xffffff; if (sbi->sb_fs_size >= 0x80000000) { hpfs_error(s, "invalid size in superblock: %08x", (unsigned)sbi->sb_fs_size); goto bail4; } if (spareblock->n_spares_used) hpfs_load_hotfix_map(s, spareblock); /* Load bitmap directory */ if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { if (sbi->sb_err == 2) { pr_err("Improperly stopped, not mounted\n"); goto bail4; } hpfs_error(s, "improperly stopped"); } if (!sb_rdonly(s)) { spareblock->dirty = 1; spareblock->old_wrote = 0; mark_buffer_dirty(bh2); } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (sbi->sb_err >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); if (sbi->sb_err == 0) pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } if (sbi->sb_chk) { unsigned a; if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x", le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band)); goto bail4; } a = sbi->sb_dirband_size; sbi->sb_dirband_size = 0; if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") || hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") || hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) { mark_dirty(s, 0); goto bail4; } sbi->sb_dirband_size = a; } else pr_err("You really don't want any checks? You are crazy...\n"); /* Load code page table */ if (le32_to_cpu(spareblock->n_code_pages)) if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) pr_err("code page support is disabled\n"); brelse(bh2); brelse(bh1); brelse(bh0); root = iget_locked(s, sbi->sb_root); if (!root) goto bail0; hpfs_init_inode(root); hpfs_read_inode(root); unlock_new_inode(root); s->s_root = d_make_root(root); if (!s->s_root) goto bail0; /* * find the root directory's . pointer & finish filling in the inode */ root_dno = hpfs_fnode_dno(s, sbi->sb_root); if (root_dno) de = map_dirent(root, root_dno, "\001\001", 2, NULL, &qbh); if (!de) hpfs_error(s, "unable to find root dir"); else { inode_set_atime(root, local_to_gmt(s, le32_to_cpu(de->read_date)), 0); inode_set_mtime(root, local_to_gmt(s, le32_to_cpu(de->write_date)), 0); inode_set_ctime(root, local_to_gmt(s, le32_to_cpu(de->creation_date)), 0); hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) root->i_size = 2048; if (root->i_blocks == -1) root->i_blocks = 5; hpfs_brelse4(&qbh); } hpfs_unlock(s); return 0; bail4: brelse(bh2); bail3: brelse(bh1); bail2: brelse(bh0); bail1: bail0: hpfs_unlock(s); free_sbi(sbi); return -EINVAL; } static int hpfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, hpfs_fill_super); } static void hpfs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations hpfs_fc_context_ops = { .parse_param = hpfs_parse_param, .get_tree = hpfs_get_tree, .reconfigure = hpfs_reconfigure, .free = hpfs_free_fc, }; static int hpfs_init_fs_context(struct fs_context *fc) { struct hpfs_fc_context *ctx; ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL); if (!ctx) return -ENOMEM; if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { struct super_block *sb = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(sb); ctx->uid = sbi->sb_uid; ctx->gid = sbi->sb_gid; ctx->umask = 0777 & ~sbi->sb_mode; ctx->lowercase = sbi->sb_lowercase; ctx->eas = sbi->sb_eas; ctx->chk = sbi->sb_chk; ctx->chkdsk = sbi->sb_chkdsk; ctx->errs = sbi->sb_err; ctx->timeshift = sbi->sb_timeshift; } else { ctx->uid = current_uid(); ctx->gid = current_gid(); ctx->umask = current_umask(); ctx->lowercase = 0; ctx->eas = 2; ctx->chk = 1; ctx->errs = 1; ctx->chkdsk = 1; ctx->timeshift = 0; } fc->fs_private = ctx; fc->ops = &hpfs_fc_context_ops; return 0; }; static struct file_system_type hpfs_fs_type = { .owner = THIS_MODULE, .name = "hpfs", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = hpfs_init_fs_context, .parameters = hpfs_param_spec, }; MODULE_ALIAS_FS("hpfs"); static int __init init_hpfs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&hpfs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_hpfs_fs(void) { unregister_filesystem(&hpfs_fs_type); destroy_inodecache(); } module_init(init_hpfs_fs) module_exit(exit_hpfs_fs) MODULE_DESCRIPTION("OS/2 HPFS file system support"); MODULE_LICENSE("GPL");
152 142 16 489 501 3 487 6 6 6 6 3 3 6 6 3 3 337 339 6 6 6 1174 1178 1178 1173 464 332 333 332 814 813 814 347 117 229 229 222 224 221 220 220 220 221 467 386 33 349 3 14 48 285 478 8 470 473 473 356 356 353 4 304 163 459 425 401 147 147 371 2 370 388 388 42 351 348 84 4 5 86 387 56 380 84 84 209 211 211 465 465 465 4 5 529 516 4 3 4 5 175 7 7 337 513 340 172 387 1 260 128 6 6 233 233 171 61 233 146 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include "mballoc.h" #include <trace/events/ext4.h> #include <kunit/static_stub.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* * Calculate block group number for a given block number */ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block) { ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); return group; } /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) *blockgrpp = blocknr; } /* * Check whether the 'block' lives within the 'block_group'. Returns 1 if so * and 0 otherwise. */ static inline int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } /* * Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned base_clusters, num_clusters; int block_cluster = -1, inode_cluster; int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ base_clusters = ext4_num_base_meta_clusters(sb, block_group); num_clusters = base_clusters; /* * Account and record inode table clusters if any cluster * is in the block group, or inode table cluster range is * [-1, -1] and won't overlap with block/inode bitmap cluster * accounted below. */ itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { itbl_blk_start = max(itbl_blk_start, start); itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); num_clusters += itbl_cluster_end - itbl_cluster_start + 1; /* check if border cluster is overlapped */ if (itbl_cluster_start == base_clusters - 1) num_clusters--; } /* * For the allocation bitmaps, we first need to check to see * if the block is in the block group. If it is, then check * to see if the cluster is already accounted for in the clusters * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); if (block_cluster >= base_clusters && (block_cluster < itbl_cluster_start || block_cluster > itbl_cluster_end)) num_clusters++; } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); /* * Additional check if inode bitmap is in just accounted * block_cluster */ if (inode_cluster != block_cluster && inode_cluster >= base_clusters && (inode_cluster < itbl_cluster_start || inode_cluster > itbl_cluster_end)) num_clusters++; } return num_clusters; } static unsigned int num_clusters_in_group(struct super_block *sb, ext4_group_t block_group) { unsigned int blocks; if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and * last group, just in case some other tool was used, * we need to make sure we calculate the right free * blocks. */ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else blocks = EXT4_BLOCKS_PER_GROUP(sb); return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; ASSERT(buffer_locked(bh)); if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); if ((bit_max >> 3) >= bh->b_size) return -EFSCORRUPTED; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); return 0; } /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } /* * The free blocks are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. The descriptors are loaded in memory * when a file system is mounted (see ext4_fill_super). */ /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block * @block_group: given block group * @bh: pointer to the buffer head to store the block * group descriptor */ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { unsigned int group_desc; unsigned int offset; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, sb, block_group, bh); if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); return NULL; } group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); /* * sbi_array_rcu_deref returns with rcu unlocked, this is ok since * the pointer being dereferenced won't be dereferenced again. By * looking at the usage in add_new_gdb() the value isn't modified, * just the pointer, and so it remains valid. */ if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } desc = (struct ext4_group_desc *)( (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) *bh = bh_p; return desc; } static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t next_zero_bit; unsigned long bitmap_size = sb->s_blocksize * 8; unsigned int offset = num_clusters_in_group(sb, block_group); if (bitmap_size <= offset) return 0; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); return (next_zero_bit < bitmap_size ? next_zero_bit : 0); } struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group) { struct ext4_group_info **grp_info; long indexv, indexh; if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) return NULL; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); return grp_info[indexh]; } /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups * or it has to also read the block group where the bitmaps * are located to verify they are set. */ return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; } static int ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /** * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @ignore_locked: ignore locked buffers * * Read the bitmap for a given block_group,and validate the * bits for block/inode/inode tables are set in the bitmaps * * Return buffer_head on success or an ERR_PTR in case of failure. */ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, sb, block_group, ignore_locked); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (ignore_locked && buffer_locked(bh)) { /* buffer under IO already, return if called for prefetching */ put_bh(bh); return NULL; } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Block bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); if (err) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? REQ_RAHEAD : 0), ext4_end_bitmap_read, ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* Returns 0 on success, -errno on error */ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_group_desc *desc; KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, sb, block_group, bh); if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; int err; bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); if (err) { put_bh(bh); return ERR_PTR(err); } return bh; } /** * ext4_has_free_clusters() * @sbi: in-core super block structure. * @nclusters: number of needed blocks * @flags: flags from ext4_mb_new_blocks() * * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + resv_clusters; if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || (flags & EXT4_MB_USE_ROOT_BLOCKS) || capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) return 1; } /* No free blocks. Let's see if we can dip into reserved pool */ if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; } /** * ext4_should_retry_alloc() - check if a block allocation should be retried * @sb: superblock * @retries: number of retry attempts made so far * * ext4_should_retry_alloc() is called when ENOSPC is returned while * attempting to allocate blocks. If there's an indication that a pending * journal transaction might free some space and allow another attempt to * succeed, this function will wait for the current or committing transaction * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (!sbi->s_journal) return 0; if (++(*retries) > 3) { percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; } /* * if there's no indication that blocks are about to be freed it's * possible we just missed a transaction commit that did so */ smp_mb(); if (atomic_read(&sbi->s_mb_free_pending) == 0) { if (test_opt(sb, DISCARD)) { atomic_inc(&sbi->s_retry_alloc_pending); flush_work(&sbi->s_discard_work); atomic_dec(&sbi->s_retry_alloc_pending); } return ext4_has_free_clusters(sbi, 1, 0); } /* * it's possible we've just missed a transaction commit here, * so ignore the returned status */ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; /* * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * * Adds up the number of free clusters from each block group. */ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; #endif } static inline int test_root(ext4_group_t a, int b) { while (1) { if (a < b) return 0; if (a == b) return 1; if ((a % b) != 0) return 0; a = a / b; } } /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the superblock (primary or backup) * in this group. Currently this will be only 0 or 1. */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (group == 0) return 1; if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; if (test_root(group, 3) || (test_root(group, 5)) || test_root(group, 7)) return 1; return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, ext4_group_t group) { unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; if (group == first || group == first + 1 || group == last) return 1; return 0; } static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { if (!ext4_bg_has_super(sb, group)) return 0; if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; } /** * ext4_bg_num_gdb - number of blocks used by the group table in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the group descriptor table * (primary or backup) in this group. In the future there may be a * different number of descriptor blocks in each group. */ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) { unsigned long first_meta_bg = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } return num; } static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); } /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation * * Return the ideal location to start allocating blocks for a * newly created inode. */ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_group_t block_group; ext4_grpblk_t colour; int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); ext4_fsblk_t bg_start; ext4_fsblk_t last_block; block_group = ei->i_block_group; if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME * block groups per flexgroup, reserve the first block * group for directories and special files. Regular * files will start at the second block group. This * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); if (S_ISREG(inode->i_mode)) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* * If we are doing delayed allocation, we don't need take * colour into account. */ if (test_opt(inode->i_sb, DELALLOC)) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else colour = (task_pid_nr(current) % 16) * ((last_block - bg_start) / 16); return bg_start + colour; }
113 110 1 1 2 1 4 2 2 720 722 110 110 23 723 724 724 724 724 724 724 724 724 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 /* * memfd_create system call and file sealing support * * Code was originally included in shmem.c, and broken out to facilitate * use by hugetlbfs as well as tmpfs. * * This file is released under the GPL. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/khugepaged.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> #include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ static bool memfd_folio_has_extra_refs(struct folio *folio) { return folio_ref_count(folio) != folio_expected_ref_count(folio); } static void memfd_tag_pins(struct xa_state *xas) { struct folio *folio; int latency = 0; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, folio, ULONG_MAX) { if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(xas); xas_unlock_irq(xas); cond_resched(); xas_lock_irq(xas); } xas_unlock_irq(xas); } /* * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c). * It is mainly called to allocate a folio in a memfd when the caller * (memfd_pin_folios()) cannot find a folio in the page cache at a given * index in the mapping. */ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { #ifdef CONFIG_HUGETLB_PAGE struct folio *folio; gfp_t gfp_mask; if (is_file_hugepages(memfd)) { /* * The folio would most likely be accessed by a DMA driver, * therefore, we have zone memory constraints where we can * alloc from. Also, the folio will be pinned for an indefinite * amount of time, so it is not expected to be migrated away. */ struct inode *inode = file_inode(memfd); struct hstate *h = hstate_file(memfd); int err = -ENOMEM; long nr_resv; gfp_mask = htlb_alloc_mask(h); gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0); if (nr_resv < 0) return ERR_PTR(nr_resv); folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL, gfp_mask); if (folio) { u32 hash; /* * Zero the folio to prevent information leaks to userspace. * Use folio_zero_user() which is optimized for huge/gigantic * pages. Pass 0 as addr_hint since this is not a faulting path * and we don't have a user virtual address yet. */ folio_zero_user(folio, 0); /* * Mark the folio uptodate before adding to page cache, * as required by filemap.c and other hugetlb paths. */ __folio_mark_uptodate(folio); /* * Serialize hugepage allocation and instantiation to prevent * races with concurrent allocations, as required by all other * callers of hugetlb_add_to_page_cache(). */ hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); if (err) { folio_put(folio); goto err_unresv; } hugetlb_set_folio_subpool(folio, subpool_inode(inode)); folio_unlock(folio); return folio; } err_unresv: if (nr_resv > 0) hugetlb_unreserve_pages(inode, idx, idx + 1, 0); return ERR_PTR(err); } #endif return shmem_read_folio(memfd->f_mapping, idx); } /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; int error, scan; memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) lru_add_drain_all(); else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; xas_set(&xas, 0); xas_lock_irq(&xas); xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found folios pinned. */ if (scan == LAST_SCAN) error = -EBUSY; else clear = false; } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } return error; } static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif return NULL; } #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; int error; /* * SEALING * Sealing allows multiple parties to share a tmpfs or hugetlbfs file * but restrict access to a specific subset of file operations. Seals * can only be added, but never removed. This way, mutually untrusted * parties can share common memory regions with a well-defined policy. * A malicious peer can thus never perform unwanted operations on a * shared object. * * Seals are only supported on special tmpfs or hugetlbfs files and * always affect the whole underlying inode. Once a seal is set, it * may prevent some kinds of access to the file. Currently, the * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file * only adds a given set of seals to the file, it never touches * existing seals. Furthermore, the "setting seals"-operation can be * sealed itself, which basically prevents any further seal from being * added. * * Semantics of sealing are only defined on volatile files. Only * anonymous tmpfs and hugetlbfs files support sealing. More * importantly, seals are never written to disk. Therefore, there's * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; inode_lock(inode); file_seals = memfd_file_seals_ptr(file); if (!file_seals) { error = -EINVAL; goto unlock; } if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; } } /* * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. */ if (seals & F_SEAL_EXEC && inode->i_mode & 0111) seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; } static int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); return seals ? *seals : -EINVAL; } long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { long error; switch (cmd) { case F_ADD_SEALS: error = memfd_add_seals(file, arg); break; case F_GET_SEALS: error = memfd_get_seals(file); break; default: error = -EINVAL; break; } return error; } #define MFD_NAME_PREFIX "memfd:" #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL struct pid_namespace *ns = task_active_pid_ns(current); int sysctl = pidns_memfd_noexec_scope(ns); if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) *flags |= MFD_NOEXEC_SEAL; else *flags |= MFD_EXEC; } if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { pr_err_ratelimited( "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", current->comm, task_pid_nr(current), sysctl); return -EACCES; } #endif return 0; } static inline bool is_write_sealed(unsigned int seals) { return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } static int check_write_seal(vm_flags_t *vm_flags_ptr) { vm_flags_t vm_flags = *vm_flags_ptr; vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) return 0; /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when * write seals are active. */ if (mask & VM_WRITE) return -EPERM; /* * This is a read-only mapping, disallow mprotect() from making a * write-sealed mapping writable in future. */ *vm_flags_ptr &= ~VM_MAYWRITE; return 0; } int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); unsigned int seals = seals_ptr ? *seals_ptr : 0; if (is_write_sealed(seals)) err = check_write_seal(vm_flags_ptr); return err; } static int sanitize_flags(unsigned int *flags_ptr) { unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. */ if (flags & ~(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; return check_sysctl_memfd_noexec(flags_ptr); } static char *alloc_name(const char __user *uname) { int error; char *name; long len; name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return ERR_PTR(-ENOMEM); memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN); /* returned length does not include terminating zero */ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); if (len < 0) { error = -EFAULT; goto err_name; } else if (len > MFD_NAME_MAX_LEN) { error = -EINVAL; goto err_name; } return name; err_name: kfree(name); return ERR_PTR(error); } static struct file *alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else { file = shmem_file_setup(name, 0, VM_NORESERVE); } if (IS_ERR(file)) return file; file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { struct inode *inode = file_inode(file); inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { *file_seals &= ~F_SEAL_SEAL; *file_seals |= F_SEAL_EXEC; } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); if (file_seals) *file_seals &= ~F_SEAL_SEAL; } return file; } SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { struct file *file; int fd, error; char *name; error = sanitize_flags(&flags); if (error < 0) return error; name = alloc_name(uname); if (IS_ERR(name)) return PTR_ERR(name); fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); if (fd < 0) { error = fd; goto err_free_name; } file = alloc_file(name, flags); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_free_fd; } fd_install(fd, file); kfree(name); return fd; err_free_fd: put_unused_fd(fd); err_free_name: kfree(name); return error; }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 5 5 5 5 5 5 5 3 5 5 5 5 5 5 5 4 5 5 5 5 4 5 5 3 5 5 5 5 5 5 5 5 3 5 5 5 5 5 5 5 5 5 5 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: Internet Group Management Protocol [IGMP] * * This code implements the IGMP protocol as defined in RFC1112. There has * been a further revision of this protocol since which is now supported. * * If you have trouble with this module be careful what gcc you have used, * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * * Alan Cox : Added lots of __inline__ to optimise * the memory usage of all the tiny little * functions. * Alan Cox : Dumped the header building experiment. * Alan Cox : Minor tweaks ready for multicast routing * and extended IGMP protocol. * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 * writes utterly bogus code otherwise (sigh) * fixed IGMP loopback to behave in the manner * desired by mrouted, fixed the fact it has been * broken since 1.3.6 and cleaned up a few minor * points. * * Chih-Jen Chang : Tried to revise IGMP to Version 2 * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu * The enhancements are mainly based on Steve Deering's * ipmulti-3.5 source code. * Chih-Jen Chang : Added the igmp_get_mrouter_info and * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of * the mrouted version on that device. * Chih-Jen Chang : Added the max_resp_time parameter to * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter * to identify the multicast router version * and do what the IGMP version 2 specified. * Chih-Jen Chang : Added a timer to revert to IGMP V2 router * Tsu-Sheng Tsao if the specified time expired. * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. * Alan Cox : Use GFP_ATOMIC in the right places. * Christian Daudt : igmp timer wasn't set for local group * memberships but was being deleted, * which caused a "del_timer() called * from %p with timer not initialized\n" * message (960131). * Christian Daudt : removed del_timer from * igmp_timer_expire function (960205). * Christian Daudt : igmp_heard_report now only calls * igmp_timer_expire if tm->running is * true (960216). * Malcolm Beattie : ttl comparison wrong in igmp_rcv made * igmp_heard_query never trigger. Expiry * miscalculation fixed in igmp_heard_query * and random() made to return unsigned to * prevent negative expiry times. * Alexey Kuznetsov: Wrong group leaving behaviour, backport * fix from pending 2.1.x patches. * Alan Cox: Forget to enable FDDI support earlier. * Alexey Kuznetsov: Fixed leaving groups on device down. * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. * David L Stevens: IGMPv3 support, with help from * Vinay Kulkarni */ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> #include <linux/pkt_sched.h> #include <linux/byteorder/generic.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/addrconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/sock.h> #include <net/checksum.h> #include <net/inet_common.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #endif #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ #define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) #define IGMP_INITIAL_REPORT_DELAY (1) /* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not * contradict to specs provided this delay is small enough. */ #define IGMP_V1_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) static int unsolicited_report_interval(struct in_device *in_dev) { int interval_ms, interval_jiffies; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV2_UNSOLICITED_REPORT_INTERVAL); else /* v3 */ interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV3_UNSOLICITED_REPORT_INTERVAL); interval_jiffies = msecs_to_jiffies(interval_ms); /* _timer functions can't handle a delay of 0 jiffies so ensure * we always return a positive value. */ if (interval_jiffies <= 0) interval_jiffies = 1; return interval_jiffies; } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp); static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); static void sf_markstate(struct ip_mc_list *pmc); #endif static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); static void ip_ma_put(struct ip_mc_list *im) { if (refcount_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); kfree_rcu(im, rcu); } } #define for_each_pmc_rcu(in_dev, pmc) \ for (pmc = rcu_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rcu_dereference(pmc->next_rcu)) #define for_each_pmc_rtnl(in_dev, pmc) \ for (pmc = rtnl_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rtnl_dereference(pmc->next_rcu)) static void ip_sf_list_clear_all(struct ip_sf_list *psf) { struct ip_sf_list *next; while (psf) { next = psf->sf_next; kfree(psf); psf = next; } } #ifdef CONFIG_IP_MULTICAST /* * Timer management */ static void igmp_stop_timer(struct ip_mc_list *im) { spin_lock_bh(&im->lock); if (timer_delete(&im->timer)) refcount_dec(&im->refcnt); im->tm_running = 0; im->reporter = 0; im->unsolicit_count = 0; spin_unlock_bh(&im->lock); } /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv = get_random_u32_below(max_delay); im->tm_running = 1; if (refcount_inc_not_zero(&im->refcnt)) { if (mod_timer(&im->timer, jiffies + tv + 2)) ip_ma_put(im); } } static void igmp_gq_start_timer(struct in_device *in_dev) { int tv = get_random_u32_below(in_dev->mr_maxdelay); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && time_after_eq(exp, (in_dev->mr_gq_timer).expires)) return; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, exp)) in_dev_hold(in_dev); } static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { int tv = get_random_u32_below(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); } static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) { spin_lock_bh(&im->lock); im->unsolicit_count = 0; if (timer_delete(&im->timer)) { if ((long)(im->timer.expires-jiffies) < max_delay) { add_timer(&im->timer); im->tm_running = 1; spin_unlock_bh(&im->lock); return; } refcount_dec(&im->refcnt); } igmp_start_timer(im, max_delay); spin_unlock_bh(&im->lock); } /* * Send an IGMP report. */ #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; if (!(pmc->gsquery && !psf->sf_gsresp)) { if (pmc->sfmode == MCAST_INCLUDE) return 1; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == IGMPV3_MODE_IS_INCLUDE; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return 0; case IGMPV3_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; return psf->sf_count[MCAST_INCLUDE] != 0; case IGMPV3_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return 0; if (pmc->sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return 0; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case IGMPV3_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return 0; return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; case IGMPV3_BLOCK_OLD_SOURCES: if (pmc->sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return 0; } static int igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct ip_sf_list *psf; int scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } /* source address selection per RFC 3376 section 4.2.13 */ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } return htonl(INADDR_ANY); } static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int size; size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); if (skb) break; size >>= 1; if (size < 256) return NULL; } skb->priority = TC_PRIO_CONTROL; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) { kfree_skb(skb); return NULL; } skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); pip->version = 4; pip->ihl = (sizeof(struct iphdr)+4)>>2; pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; rcu_read_lock(); pip->saddr = igmpv3_get_srcaddr(dev, &fl4); rcu_read_unlock(); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; ((u8 *)&pip[1])[3] = 0; skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; skb_put(skb, sizeof(*pig)); pig = igmpv3_report_hdr(skb); pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; pig->resv1 = 0; pig->csum = 0; pig->resv2 = 0; pig->ngrec = 0; return skb; } static int igmpv3_sendpack(struct sk_buff *skb) { struct igmphdr *pig = igmp_hdr(skb); const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) { return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; if (!skb) { skb = igmpv3_newpack(dev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->multiaddr; pih = igmpv3_report_hdr(skb); pih->ngrec = htons(ntohs(pih->ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV4_MIN_MTU) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || type == IGMPV3_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->tomb : &pmc->sources; if (!*psf_list) goto empty_source; pih = skb ? igmpv3_report_hdr(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pih && pih->ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); } } first = 1; psf_prev = NULL; for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { psf_prev = psf; continue; } /* Based on RFC3376 5.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->crcount)) && (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(__be32) + first*sizeof(struct igmpv3_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(__be32)); *psrc = psf->sf_inaddr; scount++; stotal++; if ((type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *psf_list = psf->sf_next; kfree(psf); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->gsquery = 0; /* clear query state on report */ return skb; } static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); } else { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } if (!skb) return 0; return igmpv3_sendpack(skb); } /* * remove zero-count source records from a source filter list */ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) { struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *ppsf = psf->sf_next; kfree(psf); } else psf_prev = psf; } } static void kfree_pmc(struct ip_mc_list *pmc) { ip_sf_list_clear_all(pmc->sources); ip_sf_list_clear_all(pmc->tomb); kfree(pmc); } static void igmpv3_send_cr(struct in_device *in_dev) { struct ip_mc_list *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; rcu_read_lock(); spin_lock_bh(&in_dev->mc_tomb_lock); /* deleted MCA's */ pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0); skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) { type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0); } pmc->crcount--; if (pmc->crcount == 0) { igmpv3_clear_zeros(&pmc->tomb); igmpv3_clear_zeros(&pmc->sources); } } if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { if (pmc_prev) pmc_prev->next = pmc_next; else in_dev->mc_tomb = pmc_next; in_dev_put(pmc->interface); kfree_pmc(pmc); } else pmc_prev = pmc; } spin_unlock_bh(&in_dev->mc_tomb_lock); /* change recs */ for_each_pmc_rcu(in_dev, pmc) { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_ALLOW_NEW_SOURCES; } else { type = IGMPV3_ALLOW_NEW_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ /* filter mode changes */ if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) type = IGMPV3_CHANGE_TO_EXCLUDE; else type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); pmc->crcount--; } spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); if (!skb) return; (void) igmpv3_sendpack(skb); } static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, int type) { struct sk_buff *skb; struct iphdr *iph; struct igmphdr *ih; struct rtable *rt; struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; struct flowi4 fl4; __be32 dst; int hlen, tlen; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) return -1; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); if (!skb) { ip_rt_put(rt); return -1; } skb->priority = TC_PRIO_CONTROL; skb_dst_set(skb, &rt->dst); skb_reserve(skb, hlen); skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); iph->version = 4; iph->ihl = (sizeof(struct iphdr)+4)>>2; iph->tos = 0xc0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ih = skb_put(skb, sizeof(struct igmphdr)); ih->type = type; ih->code = 0; ih->csum = 0; ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); in_dev_put(in_dev); } static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer); u32 mr_ifc_count; igmpv3_send_cr(in_dev); restart: mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); if (mr_ifc_count) { if (cmpxchg(&in_dev->mr_ifc_count, mr_ifc_count, mr_ifc_count - 1) != mr_ifc_count) goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } in_dev_put(in_dev); } static void igmp_ifc_event(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } static void igmp_timer_expire(struct timer_list *t) { struct ip_mc_list *im = timer_container_of(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); im->tm_running = 0; if (im->unsolicit_count && --im->unsolicit_count) igmp_start_timer(im, unsolicited_report_interval(in_dev)); im->reporter = 1; spin_unlock(&im->lock); if (IGMP_V1_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); else if (IGMP_V2_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); else igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); ip_ma_put(im); } /* mark EXCLUDE-mode sources */ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (srcs[i] == psf->sf_inaddr) { scount++; break; } } } pmc->gsquery = 0; if (scount == nsrcs) /* all sources excluded */ return 0; return 1; } static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; if (pmc->sfmode == MCAST_EXCLUDE) return igmp_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; break; } } if (!scount) { pmc->gsquery = 0; return 0; } pmc->gsquery = 1; return 1; } /* return true if packet was dropped */ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == group) { igmp_stop_timer(im); break; } } rcu_read_unlock(); return false; } /* return true if packet was dropped */ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { struct igmphdr *ih = igmp_hdr(skb); struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); struct ip_mc_list *im; __be32 group = ih->group; int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); if (len == 8) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ igmpv3_clear_delrec(in_dev); } else if (len < 12) { return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; * Interpretation of the max_delay code is problematic here. * A real v2 host would use ih_code directly, while v3 has a * different encoding. We use the v3 encoding as more likely * to be intended in a v3 query. */ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) return true; ih3 = igmpv3_query_hdr(skb); } max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ in_dev->mr_maxdelay = max_delay; /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently * received value was zero, use the default or statically * configured value. */ in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ if (in_dev->mr_qri >= in_dev->mr_qi) in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ igmp_gq_start_timer(in_dev); return false; } /* mark sources to include, if group & source-specific */ mark = ih3->nsrcs != 0; } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to a "local" group (224.0.0.X) * - For timers already running check if they need to * be reset. * - Use the igmp->igmp_code field as the maximum * delay possible */ rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { int changed; if (group && group != im->multiaddr) continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; else im->gsquery = mark; changed = !im->gsquery || igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); spin_unlock_bh(&im->lock); if (changed) igmp_mod_timer(im, max_delay); } rcu_read_unlock(); return false; } /* called in rcu_read_lock() section */ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; struct net_device *dev = skb->dev; struct in_device *in_dev; int len = skb->len; bool dropped = true; if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); if (!dev) goto drop; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; if (skb_checksum_simple_validate(skb)) goto drop; ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: dropped = igmp_heard_query(in_dev, skb, len); break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ if (rt_is_output_route(skb_rtable(skb))) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) dropped = igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 return pim_rcv_v1(skb); #endif case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: case IGMP_MTRACE: case IGMP_MTRACE_RESP: break; default: break; } drop: if (dropped) kfree_skb(skb); else consume_skb(skb); return 0; } #endif /* * Add a filter to a device */ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. We will get multicast token leakage, when IFF_MULTICAST is changed. This check should be done in ndo_set_rx_mode routine. Something sort of: if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } --ANK */ if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } /* * Remove a filter from a device */ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } #ifdef CONFIG_IP_MULTICAST /* * deleted ip_mc_list manipulation */ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp) { struct ip_mc_list *pmc; struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ pmc = kzalloc(sizeof(*pmc), gfp); if (!pmc) return; spin_lock_init(&pmc->lock); spin_lock_bh(&im->lock); pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); spin_lock_bh(&in_dev->mc_tomb_lock); pmc->next = in_dev->mc_tomb; in_dev->mc_tomb = pmc; spin_unlock_bh(&in_dev->mc_tomb_lock); } /* * restore ip_mc_list deleted records */ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc, *pmc_prev; struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); __be32 multiaddr = im->multiaddr; spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) pmc_prev->next = pmc->next; else in_dev->mc_tomb = pmc->next; } spin_unlock_bh(&in_dev->mc_tomb_lock); spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); } spin_unlock_bh(&im->lock); } /* * flush ip_mc_list deleted records */ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; spin_lock_bh(&in_dev->mc_tomb_lock); pmc = in_dev->mc_tomb; in_dev->mc_tomb = NULL; spin_unlock_bh(&in_dev->mc_tomb_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; ip_mc_clear_src(pmc); in_dev_put(pmc->interface); kfree_pmc(pmc); } /* clear dead sources, too */ rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { struct ip_sf_list *psf; spin_lock_bh(&pmc->lock); psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(psf); } rcu_read_unlock(); } #endif static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); int reporter; #endif if (im->loaded) { im->loaded = 0; ip_mc_filter_del(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; reporter = im->reporter; igmp_stop_timer(im); if (!in_dev->dead) { if (IGMP_V1_SEEN(in_dev)) return; if (IGMP_V2_SEEN(in_dev)) { if (reporter) igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); return; } /* IGMPv3 */ igmpv3_add_delrec(in_dev, im, gfp); igmp_ifc_event(in_dev); } #endif } static void igmp_group_dropped(struct ip_mc_list *im) { __igmp_group_dropped(im, GFP_KERNEL); } static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); #endif if (im->loaded == 0) { im->loaded = 1; ip_mc_filter_add(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; if (in_dev->dead) return; im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } /* else, v3 */ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should * not send filter-mode change record as the mode should be from * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif } /* * Multicast list managers */ static u32 ip_mc_hash(const struct ip_mc_list *im) { return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG); } static void ip_mc_hash_add(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash; u32 hash; mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; rcu_assign_pointer(mc_hash[hash], im); return; } /* do not use a hash table for small number of items */ if (in_dev->mc_count < 4) return; mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG, GFP_KERNEL); if (!mc_hash) return; for_each_pmc_rtnl(in_dev, im) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; RCU_INIT_POINTER(mc_hash[hash], im); } rcu_assign_pointer(in_dev->mc_hash, mc_hash); } static void ip_mc_hash_remove(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash); struct ip_mc_list *aux; if (!mc_hash) return; mc_hash += ip_mc_hash(im); while ((aux = rtnl_dereference(*mc_hash)) != im) mc_hash = &aux->next_hash; *mc_hash = im->next_hash; } int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, const struct ip_mc_list *im, struct inet_fill_args *args) { struct ifa_cacheinfo ci; struct ifaddrmsg *ifm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = 32; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_UNIVERSE; ifm->ifa_index = dev->ifindex; ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; ci.tstamp = ci.cstamp; ci.ifa_prefered = INFINITY_LIFE_TIME; ci.ifa_valid = INFINITY_LIFE_TIME; if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 || nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static void inet_ifmcaddr_notify(struct net_device *dev, const struct ip_mc_list *im, int event) { struct inet_fill_args fillargs = { .event = event, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(__be32)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet_fill_ifmcaddr(skb, dev, im, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); } /* * A socket has joined a multicast group on device dev. */ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode, gfp_t gfp) { struct ip_mc_list __rcu **mc_hash; struct ip_mc_list *im; ASSERT_RTNL(); mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG); for (im = rtnl_dereference(mc_hash[hash]); im; im = rtnl_dereference(im->next_hash)) { if (im->multiaddr == addr) break; } } else { for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) break; } } if (im) { im->users++; ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } im = kzalloc(sizeof(*im), gfp); if (!im) goto out; im->users = 1; im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; im->mca_cstamp = jiffies; im->mca_tstamp = im->mca_cstamp; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST timer_setup(&im->timer, igmp_timer_expire, 0); #endif im->next_rcu = in_dev->mc_list; in_dev->mc_count++; rcu_assign_pointer(in_dev->mc_list, im); ip_mc_hash_add(in_dev, im); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp); } EXPORT_SYMBOL(__ip_mc_inc_group); void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { __ip_mc_inc_group(in_dev, addr, GFP_KERNEL); } EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) { const struct iphdr *iph; unsigned int len; unsigned int offset = skb_network_offset(skb) + sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) return -EINVAL; offset += ip_hdrlen(skb) - sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) return -EINVAL; len = skb_network_offset(skb) + ntohs(iph->tot_len); if (skb->len < len || len < offset) return -EINVAL; skb_set_transport_header(skb, offset); return 0; } static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb); len += sizeof(struct igmpv3_report); return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { unsigned int transport_len = ip_transport_len(skb); unsigned int len; /* IGMPv{1,2}? */ if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? */ if (transport_len < sizeof(struct igmpv3_query)) return -EINVAL; len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); if (!ip_mc_may_pull(skb, len)) return -EINVAL; } /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer * all-systems destination addresses (224.0.0.1) for general queries */ if (!igmp_hdr(skb)->group && ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) return -EINVAL; return 0; } static int ip_mc_check_igmp_msg(struct sk_buff *skb) { switch (igmp_hdr(skb)->type) { case IGMP_HOST_LEAVE_MESSAGE: case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: return 0; case IGMPV3_HOST_MEMBERSHIP_REPORT: return ip_mc_check_igmp_reportv3(skb); case IGMP_HOST_MEMBERSHIP_QUERY: return ip_mc_check_igmp_query(skb); default: return -ENOMSG; } } static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } static int ip_mc_check_igmp_csum(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); unsigned int transport_len = ip_transport_len(skb); struct sk_buff *skb_chk; if (!ip_mc_may_pull(skb, len)) return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) return -EINVAL; if (skb_chk != skb) kfree_skb(skb_chk); return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); if (ret < 0) return ret; if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; ret = ip_mc_check_igmp_csum(skb); if (ret < 0) return ret; return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); /* * Resend IGMP JOIN report; used by netdev notifier. */ static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; /* a failover is happening and switches * must be notified immediately */ if (IGMP_V1_SEEN(in_dev)) type = IGMP_HOST_MEMBERSHIP_REPORT; else if (IGMP_V2_SEEN(in_dev)) type = IGMPV2_HOST_MEMBERSHIP_REPORT; else type = IGMPV3_HOST_MEMBERSHIP_REPORT; igmp_send_report(in_dev, im, type); } #endif } /* * A socket has left a multicast group on device dev */ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { struct ip_mc_list *i; struct ip_mc_list __rcu **ip; ASSERT_RTNL(); for (ip = &in_dev->mc_list; (i = rtnl_dereference(*ip)) != NULL; ip = &i->next_rcu) { if (i->multiaddr == addr) { if (--i->users == 0) { ip_mc_hash_remove(in_dev, i); *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); inet_ifmcaddr_notify(in_dev->dev, i, RTM_DELMULTICAST); ip_mc_clear_src(i); if (!in_dev->dead) ip_rt_multicast_event(in_dev); ip_ma_put(i); return; } break; } } } EXPORT_SYMBOL(__ip_mc_dec_group); /* Device changing type */ void ip_mc_unmap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); } void ip_mc_remap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* Device going down */ void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); #ifdef CONFIG_IP_MULTICAST WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; if (timer_delete(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); #endif ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } #ifdef CONFIG_IP_MULTICAST static void ip_mc_reset(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) { } #endif void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); #endif ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } /* Device going up */ void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* * Device is about to be destroyed: clean up. */ void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; ASSERT_RTNL(); /* Deactivate timers */ ip_mc_down(in_dev); #ifdef CONFIG_IP_MULTICAST igmpv3_clear_delrec(in_dev); #endif while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; ip_mc_clear_src(i); ip_ma_put(i); } } /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct net_device *dev = NULL; struct in_device *idev = NULL; if (imr->imr_ifindex) { idev = inetdev_by_index(net, imr->imr_ifindex); return idev; } if (imr->imr_address.s_addr) { dev = __ip_dev_find(net, imr->imr_address.s_addr, false); if (!dev) return NULL; } if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); } } if (dev) { imr->imr_ifindex = dev->ifindex; idev = __in_dev_get_rtnl(dev); } return idev; } /* * Join a socket to a group */ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } psf->sf_count[sfmode]--; if (psf->sf_count[sfmode] == 0) { ip_rt_multicast_event(pmc->interface); } if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ if (psf_prev) psf_prev->sf_next = psf->sf_next; else pmc->sources = psf->sf_next; #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; } else #endif kfree(psf); } return rv; } #ifndef CONFIG_IP_MULTICAST #define igmp_ifc_event(x) do { } while (0) #endif static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int changerec = 0; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif if (!delta) { err = -EINVAL; if (!pmc->sfcount[sfmode]) goto out_unlock; pmc->sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->sfmode == MCAST_EXCLUDE && pmc->sfcount[MCAST_EXCLUDE] == 0 && pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { igmp_ifc_event(pmc->interface); #endif } out_unlock: spin_unlock_bh(&pmc->lock); return err; } /* * Add multicast single-source filter to the interface list */ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf) { psf = kzalloc(sizeof(*psf), GFP_ATOMIC); if (!psf) return -ENOBUFS; psf->sf_inaddr = *psfsrc; if (psf_prev) { psf_prev->sf_next = psf; } else pmc->sources = psf; } psf->sf_count[sfmode]++; if (psf->sf_count[sfmode] == 1) { ip_rt_multicast_event(pmc->interface); } return 0; } #ifdef CONFIG_IP_MULTICAST static void sf_markstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } static int sf_setstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *dpsf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; int qrv = pmc->interface->mr_qrv; int new_in, rv; rv = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; } if (dpsf) { if (prev) prev->sf_next = dpsf->sf_next; else pmc->tomb = dpsf->sf_next; kfree(dpsf); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; /* pmc->lock held by callers */ dpsf->sf_next = pmc->tomb; pmc->tomb = dpsf; } dpsf->sf_crcount = qrv; rv++; } } return rv; } #endif /* * Add multicast source filter list to the interface list */ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int isexclude; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif isexclude = pmc->sfmode == MCAST_EXCLUDE; if (!delta) pmc->sfcount[sfmode]++; err = 0; for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) pmc->sfcount[sfmode]--; for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif /* filter mode change */ if (pmc->sfcount[MCAST_EXCLUDE]) pmc->sfmode = MCAST_EXCLUDE; else if (pmc->sfcount[MCAST_INCLUDE]) pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { igmp_ifc_event(in_dev); #endif } spin_unlock_bh(&pmc->lock); return err; } static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *tomb, *sources; spin_lock_bh(&pmc->lock); tomb = pmc->tomb; pmc->tomb = NULL; sources = pmc->sources; pmc->sources = NULL; pmc->sfmode = MCAST_EXCLUDE; pmc->sfcount[MCAST_INCLUDE] = 0; pmc->sfcount[MCAST_EXCLUDE] = 1; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(tomb); ip_sf_list_clear_all(sources); } /* Join a multicast group */ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; int err; ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRINUSE; ifindex = imr->imr_ifindex; for_each_pmc_rtnl(inet, i) { if (i->multi.imr_multiaddr.s_addr == addr && i->multi.imr_ifindex == ifindex) goto done; count++; } err = -ENOBUFS; if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships)) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL); err = 0; done: return err; } /* Join ASM (Any-Source Multicast) group */ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ip_mc_join_group); /* Join SSM (Source-Specific Multicast) group */ int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { return __ip_mc_join_group(sk, imr, mode); } static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); int err; if (!psf) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, psf->sl_count, psf->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc); kfree_rcu(psf, rcu); return err; } int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct ip_mc_socklist __rcu **imlp; struct in_device *in_dev; struct net *net = sock_net(sk); __be32 group = imr->imr_multiaddr.s_addr; u32 ifindex; int ret = -EADDRNOTAVAIL; ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; imlp = &iml->next_rcu) { if (iml->multi.imr_multiaddr.s_addr != group) continue; if (ifindex) { if (iml->multi.imr_ifindex != ifindex) continue; } else if (imr->imr_address.s_addr && imr->imr_address.s_addr != iml->multi.imr_address.s_addr) continue; (void) ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next_rcu; if (in_dev) ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } out: return ret; } EXPORT_SYMBOL(ip_mc_leave_group); int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex) { int err; struct ip_mreqn imr; __be32 addr = mreqs->imr_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, j, rv; if (!ipv4_is_multicast(addr)) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if ((pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr) && (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = rtnl_dereference(pmc->sflist); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip_sf_socklist *newpsl; int count = IP_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv == 0) /* address already there is an error */ goto done; for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; err = 0; /* update the interface list */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) { int err = 0; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; if (!ipv4_is_multicast(addr)) return -EINVAL; if (msf->imsf_fmode != MCAST_INCLUDE && msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (msf->imsf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, msf->imsf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; memcpy(newpsl->sl_addr, msf->imsf_slist_flex, flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } } else { newpsl = NULL; (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, 0, NULL, 0); } psl = rtnl_dereference(pmc->sflist); if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); pmc->sfmode = msf->imsf_fmode; err = 0; done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen) { int err, len, count, copycount, msf_size; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) /* must have a prior join */ goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); if (!psl) { count = 0; } else { count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; msf_size = IP_MSFILTER_SIZE(copycount); if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) || copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && copy_to_sockptr_offset(optval, offsetof(struct ip_msfilter, imsf_slist_flex), psl->sl_addr, len)) return -EFAULT; return 0; done: return err; } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; ASSERT_RTNL(); psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; addr = psin->sin_addr.s_addr; if (!ipv4_is_multicast(addr)) return -EINVAL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; int ret; ret = 1; if (!ipv4_is_multicast(loc_addr)) goto out; rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && (pmc->multi.imr_ifindex == dif || (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet_test_bit(MC_ALL, sk); if (!pmc) goto unlock; psl = rcu_dereference(pmc->sflist); ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) goto unlock; for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) goto unlock; ret = 1; unlock: rcu_read_unlock(); out: return ret; } /* * A socket is closing. */ void ip_mc_drop_socket(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct net *net = sock_net(sk); if (!inet->mc_list) return; rtnl_lock(); while ((iml = rtnl_dereference(inet->mc_list)) != NULL) { struct in_device *in_dev; inet->mc_list = iml->next_rcu; in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); } rtnl_unlock(); } /* called with rcu_read_lock() */ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; struct ip_sf_list *psf; int rv = 0; mc_hash = rcu_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG); for (im = rcu_dereference(mc_hash[hash]); im != NULL; im = rcu_dereference(im->next_hash)) { if (im->multiaddr == mc_addr) break; } } else { for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; } } if (im && proto == IPPROTO_IGMP) { rv = 1; } else if (im) { if (src_addr) { spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } if (psf) rv = psf->sf_count[MCAST_INCLUDE] || psf->sf_count[MCAST_EXCLUDE] != im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } return rv; } #if defined(CONFIG_PROC_FS) struct igmp_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *in_dev; }; #define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_mc_list *im = NULL; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; im = rcu_dereference(in_dev->mc_list); if (im) { state->in_dev = in_dev; break; } } return im; } static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = rcu_dereference(im->next_rcu); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; im = rcu_dereference(state->in_dev->mc_list); } return im; } static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ip_mc_list *im = igmp_mc_get_first(seq); if (im) while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_mc_list *im; if (v == SEQ_START_TOKEN) im = igmp_mc_get_first(seq); else im = igmp_mc_get_next(seq, v); ++*pos; return im; } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); else { struct ip_mc_list *im = v; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); char *querier; long delta; #ifdef CONFIG_IP_MULTICAST querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : IGMP_V2_SEEN(state->in_dev) ? "V2" : "V3"; #else querier = "NONE"; #endif if (rcu_access_pointer(state->in_dev->mc_list) == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } delta = im->timer.expires - jiffies; seq_printf(seq, "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, im->reporter); } return 0; } static const struct seq_operations igmp_mc_seq_ops = { .start = igmp_mc_seq_start, .next = igmp_mc_seq_next, .stop = igmp_mc_seq_stop, .show = igmp_mc_seq_show, }; struct igmp_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *idev; struct ip_mc_list *im; }; #define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_sf_list *psf = NULL; struct ip_mc_list *im = NULL; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *idev; idev = __in_dev_get_rcu(state->dev); if (unlikely(!idev)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { spin_lock_bh(&im->lock); psf = im->sources; if (likely(psf)) { state->im = im; state->idev = idev; break; } spin_unlock_bh(&im->lock); } } return psf; } static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); psf = psf->sf_next; while (!psf) { spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } spin_lock_bh(&state->im->lock); psf = state->im->sources; } out: return psf; } static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip_sf_list *psf = igmp_mcf_get_first(seq); if (psf) while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp_mcf_get_first(seq); else psf = igmp_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im)) { spin_unlock_bh(&state->im->lock); state->im = NULL; } state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) { struct ip_sf_list *psf = v; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " "0x%08x %6lu %6lu\n", state->dev->ifindex, state->dev->name, ntohl(state->im->multiaddr), ntohl(psf->sf_inaddr), psf->sf_count[MCAST_INCLUDE], psf->sf_count[MCAST_EXCLUDE]); } return 0; } static const struct seq_operations igmp_mcf_seq_ops = { .start = igmp_mcf_seq_start, .next = igmp_mcf_seq_next, .stop = igmp_mcf_seq_stop, .show = igmp_mcf_seq_show, }; static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; int err; pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops, sizeof(struct igmp_mc_iter_state)); if (!pde) goto out_igmp; pde = proc_create_net("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state)); if (!pde) goto out_mcfilter; err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, SOCK_DGRAM, 0, net); if (err < 0) { pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", err); goto out_sock; } return 0; out_sock: remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: return -ENOMEM; } static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; #endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; switch (event) { case NETDEV_RESEND_IGMP: in_dev = __in_dev_get_rtnl(dev); if (in_dev) ip_mc_rejoin_groups(in_dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; int __init igmp_mc_init(void) { #if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); if (err) return err; err = register_netdevice_notifier(&igmp_notifier); if (err) goto reg_notif_fail; return 0; reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; #else return register_netdevice_notifier(&igmp_notifier); #endif }
5 5 4 116 3 56 4 14 40 9 3 49 53 19 18 38 82 82 81 1 14 18 70 17 70 17 75 14 81 14 39 40 20 20 28 1 7 4 1 37 4 8 32 8 10 93 16 94 177 33 98 28 118 89 26 111 3 105 7 104 12 37 26 26 5 31 5 27 3 27 1 10 26 38 60 54 1 6 6 1 1 1 1 5 22 7 16 2 21 22 1 23 8 62 54 153 25 82 96 14 15 73 29 274 263 35 268 25 274 25 17 1 4 31 1 3 28 2 20 16 4 17 10 3 3 9 8 8 17 8 13 12 1 12 2 1 1 1 1 1 1 1 1 63 63 63 4 4 4 4 22 6 5 1 17 13 13 8 7 6 6 40 18 22 22 2 18 9 6 6 6 20 20 20 1 20 4 8 14 14 20 20 5 19 13 1 4 9 4 10 10 10 7 3 10 10 10 10 10 7 3 10 2 1 2 7 6 5 5 5 5 2 3 5 2 3 3 4 1 3 1 1 3 2 1 3 3 3 3 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame? */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/kernel.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" /* * You can set external NTFS_MIN_LOG2_OF_CLUMP/NTFS_MAX_LOG2_OF_CLUMP to manage * preallocate algorithm. */ #ifndef NTFS_MIN_LOG2_OF_CLUMP #define NTFS_MIN_LOG2_OF_CLUMP 16 #endif #ifndef NTFS_MAX_LOG2_OF_CLUMP #define NTFS_MAX_LOG2_OF_CLUMP 26 #endif // 16M #define NTFS_CLUMP_MIN (1 << (NTFS_MIN_LOG2_OF_CLUMP + 8)) // 16G #define NTFS_CLUMP_MAX (1ull << (NTFS_MAX_LOG2_OF_CLUMP + 8)) static inline u64 get_pre_allocated(u64 size) { u32 clump; u8 align_shift; u64 ret; if (size <= NTFS_CLUMP_MIN) { clump = 1 << NTFS_MIN_LOG2_OF_CLUMP; align_shift = NTFS_MIN_LOG2_OF_CLUMP; } else if (size >= NTFS_CLUMP_MAX) { clump = 1 << NTFS_MAX_LOG2_OF_CLUMP; align_shift = NTFS_MAX_LOG2_OF_CLUMP; } else { align_shift = NTFS_MIN_LOG2_OF_CLUMP - 1 + __ffs(size >> (8 + NTFS_MIN_LOG2_OF_CLUMP)); clump = 1u << align_shift; } ret = (((size + clump - 1) >> align_shift)) << align_shift; return ret; } /* * attr_load_runs - Load all runs stored in @attr. */ static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni, struct runs_tree *run, const CLST *vcn) { int err; CLST svcn = le64_to_cpu(attr->nres.svcn); CLST evcn = le64_to_cpu(attr->nres.evcn); u32 asize; u16 run_off; if (svcn >= evcn + 1 || run_is_mapped_full(run, svcn, evcn)) return 0; if (vcn && (evcn < *vcn || *vcn < svcn)) return -EINVAL; asize = le32_to_cpu(attr->size); run_off = le16_to_cpu(attr->nres.run_off); if (run_off > asize) return -EINVAL; err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, vcn ? *vcn : svcn, Add2Ptr(attr, run_off), asize - run_off); if (err < 0) return err; return 0; } /* * run_deallocate_ex - Deallocate clusters. */ static int run_deallocate_ex(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST len, CLST *done, bool trim) { int err = 0; CLST vcn_next, vcn0 = vcn, lcn, clen, dn = 0; size_t idx; if (!len) goto out; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { failed: run_truncate(run, vcn0); err = -EINVAL; goto out; } for (;;) { if (clen > len) clen = len; if (!clen) { err = -EINVAL; goto out; } if (lcn != SPARSE_LCN) { if (sbi) { /* mark bitmap range [lcn + clen) as free and trim clusters. */ mark_as_free_ex(sbi, lcn, clen, trim); } dn += clen; } len -= clen; if (!len) break; vcn_next = vcn + clen; if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || vcn != vcn_next) { /* Save memory - don't load entire run. */ goto failed; } } out: if (done) *done += dn; return err; } /* * attr_allocate_clusters - Find free space, mark it as used and store in @run. */ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, CLST *new_lcn, CLST *new_len) { int err; CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0; size_t cnt = run->count; for (;;) { err = ntfs_look_for_free_space(sbi, lcn, len + pre, &lcn, &flen, opt); if (err == -ENOSPC && pre) { pre = 0; if (*pre_alloc) *pre_alloc = 0; continue; } if (err) goto out; if (vcn == vcn0) { /* Return the first fragment. */ if (new_lcn) *new_lcn = lcn; if (new_len) *new_len = flen; } /* Add new fragment into run storage. */ if (!run_add_entry(run, vcn, lcn, flen, opt & ALLOCATE_MFT)) { /* Undo last 'ntfs_look_for_free_space' */ mark_as_free_ex(sbi, lcn, len, false); err = -ENOMEM; goto out; } if (opt & ALLOCATE_ZERO) { u8 shift = sbi->cluster_bits - SECTOR_SHIFT; err = blkdev_issue_zeroout(sbi->sb->s_bdev, (sector_t)lcn << shift, (sector_t)flen << shift, GFP_NOFS, 0); if (err) goto out; } vcn += flen; if (flen >= len || (opt & ALLOCATE_MFT) || (fr && run->count - cnt >= fr)) { *alen = vcn - vcn0; return 0; } len -= flen; } out: /* Undo 'ntfs_look_for_free_space' */ if (vcn - vcn0) { run_deallocate_ex(sbi, run, vcn0, vcn - vcn0, NULL, false); run_truncate(run, vcn0); } return err; } /* * attr_make_nonresident * * If page is not NULL - it is already contains resident data * and locked (called from ni_write_frame()). */ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, u64 new_size, struct runs_tree *run, struct ATTRIB **ins_attr, struct page *page) { struct ntfs_sb_info *sbi; struct ATTRIB *attr_s; struct MFT_REC *rec; u32 used, asize, rsize, aoff; bool is_data; CLST len, alen; char *next; int err; if (attr->non_res) { *ins_attr = attr; return 0; } sbi = mi->sbi; rec = mi->mrec; attr_s = NULL; used = le32_to_cpu(rec->used); asize = le32_to_cpu(attr->size); next = Add2Ptr(attr, asize); aoff = PtrOffset(rec, attr); rsize = le32_to_cpu(attr->res.data_size); is_data = attr->type == ATTR_DATA && !attr->name_len; /* len - how many clusters required to store 'rsize' bytes */ if (is_attr_compressed(attr)) { u8 shift = sbi->cluster_bits + NTFS_LZNT_CUNIT; len = ((rsize + (1u << shift) - 1) >> shift) << NTFS_LZNT_CUNIT; } else { len = bytes_to_cluster(sbi, rsize); } run_init(run); /* Make a copy of original attribute. */ attr_s = kmemdup(attr, asize, GFP_NOFS); if (!attr_s) { err = -ENOMEM; goto out; } if (!len) { /* Empty resident -> Empty nonresident. */ alen = 0; } else { const char *data = resident_data(attr); err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL, ALLOCATE_DEF, &alen, 0, NULL, NULL); if (err) goto out1; if (!rsize) { /* Empty resident -> Non empty nonresident. */ } else if (!is_data) { err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0); if (err) goto out2; } else if (!page) { struct address_space *mapping = ni->vfs_inode.i_mapping; struct folio *folio; folio = __filemap_get_folio( mapping, 0, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto out2; } folio_fill_tail(folio, 0, data, rsize); folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } } /* Remove original attribute. */ used -= asize; memmove(attr, Add2Ptr(attr, asize), used - aoff); rec->used = cpu_to_le32(used); mi->dirty = true; if (le) al_remove_le(ni, le); err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s), attr_s->name_len, run, 0, alen, attr_s->flags, &attr, NULL, NULL); if (err) goto out3; kfree(attr_s); attr->nres.data_size = cpu_to_le64(rsize); attr->nres.valid_size = attr->nres.data_size; *ins_attr = attr; if (is_data) ni->ni_flags &= ~NI_FLAG_RESIDENT; /* Resident attribute becomes non resident. */ return 0; out3: attr = Add2Ptr(rec, aoff); memmove(next, attr, used - aoff); memcpy(attr, attr_s, asize); rec->used = cpu_to_le32(used + asize); mi->dirty = true; out2: /* Undo: do not trim new allocated clusters. */ run_deallocate(sbi, run, false); run_close(run); out1: kfree(attr_s); out: return err; } /* * attr_set_size_res - Helper for attr_set_size(). */ static int attr_set_size_res(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, u64 new_size, struct runs_tree *run, struct ATTRIB **ins_attr) { struct ntfs_sb_info *sbi = mi->sbi; struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); u32 asize = le32_to_cpu(attr->size); u32 aoff = PtrOffset(rec, attr); u32 rsize = le32_to_cpu(attr->res.data_size); u32 tail = used - aoff - asize; char *next = Add2Ptr(attr, asize); s64 dsize = ALIGN(new_size, 8) - ALIGN(rsize, 8); if (dsize < 0) { memmove(next + dsize, next, tail); } else if (dsize > 0) { if (used + dsize > sbi->max_bytes_per_attr) return attr_make_nonresident(ni, attr, le, mi, new_size, run, ins_attr, NULL); memmove(next + dsize, next, tail); memset(next, 0, dsize); } if (new_size > rsize) memset(Add2Ptr(resident_data(attr), rsize), 0, new_size - rsize); rec->used = cpu_to_le32(used + dsize); attr->size = cpu_to_le32(asize + dsize); attr->res.data_size = cpu_to_le32(new_size); mi->dirty = true; *ins_attr = attr; return 0; } /* * attr_set_size - Change the size of attribute. * * Extend: * - Sparse/compressed: No allocated clusters. * - Normal: Append allocated and preallocated new clusters. * Shrink: * - No deallocate if @keep_prealloc is set. */ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 new_size, const u64 *new_valid, bool keep_prealloc, struct ATTRIB **ret) { int err = 0; struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; bool is_mft = ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len; u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn; CLST next_svcn, pre_alloc = -1, done = 0; bool is_ext, is_bad = false; bool dirty = false; u32 align; struct MFT_REC *rec; again: alen = 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto bad_inode; } if (!attr_b->non_res) { err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run, &attr_b); if (err) return err; /* Return if file is still resident. */ if (!attr_b->non_res) { dirty = true; goto ok1; } /* Layout of records may be changed, so do a full search. */ goto again; } is_ext = is_attr_ext(attr_b); align = sbi->cluster_size; if (is_ext) align <<= attr_b->nres.c_unit; old_valid = le64_to_cpu(attr_b->nres.valid_size); old_size = le64_to_cpu(attr_b->nres.data_size); old_alloc = le64_to_cpu(attr_b->nres.alloc_size); again_1: old_alen = old_alloc >> cluster_bits; new_alloc = (new_size + align - 1) & ~(u64)(align - 1); new_alen = new_alloc >> cluster_bits; if (keep_prealloc && new_size < old_size) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = dirty = true; goto ok; } vcn = old_alen - 1; svcn = le64_to_cpu(attr_b->nres.svcn); evcn = le64_to_cpu(attr_b->nres.evcn); if (svcn <= vcn && vcn <= evcn) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } next_le_1: svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); } /* * Here we have: * attr,mi,le - last attribute segment (containing 'vcn'). * attr_b,mi_b,le_b - base (primary) attribute segment. */ next_le: rec = mi->mrec; err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; if (new_size > old_size) { CLST to_allocate; size_t free; if (new_alloc <= old_alloc) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = dirty = true; goto ok; } /* * Add clusters. In simple case we have to: * - allocate space (vcn, lcn, len) * - update packed run in 'mi' * - update attr->nres.evcn * - update attr_b->nres.data_size/attr_b->nres.alloc_size */ to_allocate = new_alen - old_alen; add_alloc_in_same_attr_seg: lcn = 0; if (is_mft) { /* MFT allocates clusters from MFT zone. */ pre_alloc = 0; } else if (is_ext) { /* No preallocate for sparse/compress. */ pre_alloc = 0; } else if (pre_alloc == -1) { pre_alloc = 0; if (type == ATTR_DATA && !name_len && sbi->options->prealloc) { pre_alloc = bytes_to_cluster( sbi, get_pre_allocated( new_size)) - new_alen; } /* Get the last LCN to allocate from. */ if (old_alen && !run_lookup_entry(run, vcn, &lcn, NULL, NULL)) { lcn = SPARSE_LCN; } if (lcn == SPARSE_LCN) lcn = 0; else if (lcn) lcn += 1; free = wnd_zeroes(&sbi->used.bitmap); if (to_allocate > free) { err = -ENOSPC; goto out; } if (pre_alloc && to_allocate + pre_alloc > free) pre_alloc = 0; } vcn = old_alen; if (is_ext) { if (!run_add_entry(run, vcn, SPARSE_LCN, to_allocate, false)) { err = -ENOMEM; goto out; } alen = to_allocate; } else { /* ~3 bytes per fragment. */ err = attr_allocate_clusters( sbi, run, vcn, lcn, to_allocate, &pre_alloc, is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen, is_mft ? 0 : (sbi->record_size - le32_to_cpu(rec->used) + 8) / 3 + 1, NULL, NULL); if (err) goto out; } done += alen; vcn += alen; if (to_allocate > alen) to_allocate -= alen; else to_allocate = 0; pack_runs: err = mi_pack_runs(mi, attr, run, vcn - svcn); if (err) goto undo_1; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; new_alloc_tmp = (u64)next_svcn << cluster_bits; attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); mi_b->dirty = dirty = true; if (next_svcn >= vcn && !to_allocate) { /* Normal way. Update attribute and exit. */ attr_b->nres.data_size = cpu_to_le64(new_size); goto ok; } /* At least two MFT to avoid recursive loop. */ if (is_mft && next_svcn == vcn && ((u64)done << sbi->cluster_bits) >= 2 * sbi->record_size) { new_size = new_alloc_tmp; attr_b->nres.data_size = attr_b->nres.alloc_size; goto ok; } if (le32_to_cpu(rec->used) < sbi->record_size) { old_alen = next_svcn; evcn = old_alen - 1; goto add_alloc_in_same_attr_seg; } attr_b->nres.data_size = attr_b->nres.alloc_size; if (new_alloc_tmp < old_valid) attr_b->nres.valid_size = attr_b->nres.data_size; if (type == ATTR_LIST) { err = ni_expand_list(ni); if (err) goto undo_2; if (next_svcn < vcn) goto pack_runs; /* Layout of records is changed. */ goto again; } if (!ni->attr_list.size) { err = ni_create_attr_list(ni); /* In case of error layout of records is not changed. */ if (err) goto undo_2; /* Layout of records is changed. */ } if (next_svcn >= vcn) { /* This is MFT data, repeat. */ goto again; } /* Insert new attribute segment. */ err = ni_insert_nonresident(ni, type, name, name_len, run, next_svcn, vcn - next_svcn, attr_b->flags, &attr, &mi, NULL); /* * Layout of records maybe changed. * Find base attribute to update. */ le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } if (err) { /* ni_insert_nonresident failed. */ attr = NULL; goto undo_2; } /* keep runs for $MFT::$ATTR_DATA and $MFT::$ATTR_BITMAP. */ if (ni->mi.rno != MFT_REC_MFT) run_truncate_head(run, evcn + 1); svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); /* * Attribute is in consistency state. * Save this point to restore to if next steps fail. */ old_valid = old_size = old_alloc = (u64)vcn << cluster_bits; attr_b->nres.valid_size = attr_b->nres.data_size = attr_b->nres.alloc_size = cpu_to_le64(old_size); mi_b->dirty = dirty = true; goto again_1; } if (new_size != old_size || (new_alloc != old_alloc && !keep_prealloc)) { /* * Truncate clusters. In simple case we have to: * - update packed run in 'mi' * - update attr->nres.evcn * - update attr_b->nres.data_size/attr_b->nres.alloc_size * - mark and trim clusters as free (vcn, lcn, len) */ CLST dlen = 0; vcn = max(svcn, new_alen); new_alloc_tmp = (u64)vcn << cluster_bits; if (vcn > svcn) { err = mi_pack_runs(mi, attr, run, vcn - svcn); if (err) goto out; } else if (le && le->vcn) { u16 le_sz = le16_to_cpu(le->size); /* * NOTE: List entries for one attribute are always * the same size. We deal with last entry (vcn==0) * and it is not first in entries array * (list entry for std attribute always first). * So it is safe to step back. */ mi_remove_attr(NULL, mi, attr); if (!al_remove_le(ni, le)) { err = -EINVAL; goto bad_inode; } le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); } else { attr->nres.evcn = cpu_to_le64((u64)vcn - 1); mi->dirty = true; } attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); if (vcn == new_alen) { attr_b->nres.data_size = cpu_to_le64(new_size); if (new_size < old_valid) attr_b->nres.valid_size = attr_b->nres.data_size; } else { if (new_alloc_tmp <= le64_to_cpu(attr_b->nres.data_size)) attr_b->nres.data_size = attr_b->nres.alloc_size; if (new_alloc_tmp < le64_to_cpu(attr_b->nres.valid_size)) attr_b->nres.valid_size = attr_b->nres.alloc_size; } mi_b->dirty = dirty = true; err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen, true); if (err) goto out; if (is_ext) { /* dlen - really deallocated clusters. */ le64_sub_cpu(&attr_b->nres.total_size, ((u64)dlen << cluster_bits)); } run_truncate(run, vcn); if (new_alloc_tmp <= new_alloc) goto ok; old_size = new_alloc_tmp; vcn = svcn - 1; if (le == le_b) { attr = attr_b; mi = mi_b; evcn = svcn - 1; svcn = 0; goto next_le; } if (le->type != type || le->name_len != name_len || memcmp(le_name(le), name, name_len * sizeof(short))) { err = -EINVAL; goto bad_inode; } err = ni_load_mi(ni, le, &mi); if (err) goto out; attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id); if (!attr) { err = -EINVAL; goto bad_inode; } goto next_le_1; } ok: if (new_valid) { __le64 valid = cpu_to_le64(min(*new_valid, new_size)); if (attr_b->nres.valid_size != valid) { attr_b->nres.valid_size = valid; mi_b->dirty = true; } } ok1: if (ret) *ret = attr_b; if (((type == ATTR_DATA && !name_len) || (type == ATTR_ALLOC && name == I30_NAME))) { /* Update inode_set_bytes. */ if (attr_b->non_res) { new_alloc = le64_to_cpu(attr_b->nres.alloc_size); if (inode_get_bytes(&ni->vfs_inode) != new_alloc) { inode_set_bytes(&ni->vfs_inode, new_alloc); dirty = true; } } /* Don't forget to update duplicate information in parent. */ if (dirty) { ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); } } return 0; undo_2: vcn -= alen; attr_b->nres.data_size = cpu_to_le64(old_size); attr_b->nres.valid_size = cpu_to_le64(old_valid); attr_b->nres.alloc_size = cpu_to_le64(old_alloc); /* Restore 'attr' and 'mi'. */ if (attr) goto restore_run; if (le64_to_cpu(attr_b->nres.svcn) <= svcn && svcn <= le64_to_cpu(attr_b->nres.evcn)) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &svcn, &mi); if (!attr) goto bad_inode; } restore_run: if (mi_pack_runs(mi, attr, run, evcn - svcn + 1)) is_bad = true; undo_1: run_deallocate_ex(sbi, run, vcn, alen, NULL, false); run_truncate(run, vcn); out: if (is_bad) { bad_inode: _ntfs_bad_inode(&ni->vfs_inode); } return err; } /* * attr_data_get_block - Returns 'lcn' and 'len' for given 'vcn'. * * @new == NULL means just to get current mapping for 'vcn' * @new != NULL means allocate real cluster if 'vcn' maps to hole * @zero - zeroout new allocated clusters * * NOTE: * - @new != NULL is called only for sparsed or compressed attributes. * - new allocated clusters are zeroed via blkdev_issue_zeroout. */ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, CLST *len, bool *new, bool zero) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi; u8 cluster_bits; struct ATTRIB *attr, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen; CLST alloc, evcn; unsigned fr; u64 total_size, total_size0; int step = 0; if (new) *new = false; /* Try to find in cache. */ down_read(&ni->file.run_lock); if (!run_lookup_entry(run, vcn, lcn, len, NULL)) *len = 0; up_read(&ni->file.run_lock); if (*len && (*lcn != SPARSE_LCN || !new)) return 0; /* Fast normal way without allocation. */ /* No cluster in cache or we need to allocate cluster in hole. */ sbi = ni->mi.sbi; cluster_bits = sbi->cluster_bits; ni_lock(ni); down_write(&ni->file.run_lock); /* Repeat the code above (under write lock). */ if (!run_lookup_entry(run, vcn, lcn, len, NULL)) *len = 0; if (*len) { if (*lcn != SPARSE_LCN || !new) goto out; /* normal way without allocation. */ if (clen > *len) clen = *len; } le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } if (!attr_b->non_res) { *lcn = RESIDENT_LCN; *len = 1; goto out; } asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits; if (vcn >= asize) { if (new) { err = -EINVAL; } else { *len = 1; *lcn = SPARSE_LCN; } goto out; } svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; attr = attr_b; le = le_b; mi = mi_b; if (le_b && (vcn < svcn || evcn1 <= vcn)) { attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } /* Load in cache actual information. */ err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; /* Check for compressed frame. */ err = attr_is_frame_compressed(ni, attr_b, vcn >> NTFS_LZNT_CUNIT, &hint, run); if (err) goto out; if (hint) { /* if frame is compressed - don't touch it. */ *lcn = COMPRESSED_LCN; /* length to the end of frame. */ *len = NTFS_LZNT_CLUSTERS - (vcn & (NTFS_LZNT_CLUSTERS - 1)); err = 0; goto out; } if (!*len) { if (run_lookup_entry(run, vcn, lcn, len, NULL)) { if (*lcn != SPARSE_LCN || !new) goto ok; /* Slow normal way without allocation. */ if (clen > *len) clen = *len; } else if (!new) { /* Here we may return -ENOENT. * In any case caller gets zero length. */ goto ok; } } if (!is_attr_ext(attr_b)) { /* The code below only for sparsed or compressed attributes. */ err = -EINVAL; goto out; } vcn0 = vcn; to_alloc = clen; fr = (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1; /* Allocate frame aligned clusters. * ntfs.sys usually uses 16 clusters per frame for sparsed or compressed. * ntfs3 uses 1 cluster per frame for new created sparsed files. */ if (attr_b->nres.c_unit) { CLST clst_per_frame = 1u << attr_b->nres.c_unit; CLST cmask = ~(clst_per_frame - 1); /* Get frame aligned vcn and to_alloc. */ vcn = vcn0 & cmask; to_alloc = ((vcn0 + clen + clst_per_frame - 1) & cmask) - vcn; if (fr < clst_per_frame) fr = clst_per_frame; zero = true; /* Check if 'vcn' and 'vcn0' in different attribute segments. */ if (vcn < svcn || evcn1 <= vcn) { struct ATTRIB *attr2; /* Load runs for truncated vcn. */ attr2 = ni_find_attr(ni, attr_b, &le_b, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr2) { err = -EINVAL; goto out; } evcn1 = le64_to_cpu(attr2->nres.evcn) + 1; err = attr_load_runs(attr2, ni, run, NULL); if (err) goto out; } } if (vcn + to_alloc > asize) to_alloc = asize - vcn; /* Get the last LCN to allocate from. */ hint = 0; if (vcn > evcn1) { if (!run_add_entry(run, evcn1, SPARSE_LCN, vcn - evcn1, false)) { err = -ENOMEM; goto out; } } else if (vcn && !run_lookup_entry(run, vcn - 1, &hint, NULL, NULL)) { hint = -1; } /* Allocate and zeroout new clusters. */ err = attr_allocate_clusters(sbi, run, vcn, hint + 1, to_alloc, NULL, zero ? ALLOCATE_ZERO : ALLOCATE_DEF, &alen, fr, lcn, len); if (err) goto out; *new = true; step = 1; end = vcn + alen; /* Save 'total_size0' to restore if error. */ total_size0 = le64_to_cpu(attr_b->nres.total_size); total_size = total_size0 + ((u64)alen << cluster_bits); if (vcn != vcn0) { if (!run_lookup_entry(run, vcn0, lcn, len, NULL)) { err = -EINVAL; goto out; } if (*lcn == SPARSE_LCN) { /* Internal error. Should not happened. */ WARN_ON(1); err = -EINVAL; goto out; } /* Check case when vcn0 + len overlaps new allocated clusters. */ if (vcn0 + *len > end) *len = end - vcn0; } repack: err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); if (err) goto out; attr_b->nres.total_size = cpu_to_le64(total_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mi_b->dirty = true; mark_inode_dirty(&ni->vfs_inode); /* Stored [vcn : next_svcn) from [vcn : end). */ next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (end <= evcn1) { if (next_svcn == evcn1) { /* Normal way. Update attribute and exit. */ goto ok; } /* Add new segment [next_svcn : evcn1 - next_svcn). */ if (!ni->attr_list.size) { err = ni_create_attr_list(ni); if (err) goto undo1; /* Layout of records is changed. */ le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } attr = attr_b; le = le_b; mi = mi_b; goto repack; } } /* * The code below may require additional cluster (to extend attribute list) * and / or one MFT record * It is too complex to undo operations if -ENOSPC occurs deep inside * in 'ni_insert_nonresident'. * Return in advance -ENOSPC here if there are no free cluster and no free MFT. */ if (!ntfs_check_for_free_space(sbi, 1, 1)) { /* Undo step 1. */ err = -ENOSPC; goto undo1; } step = 2; svcn = evcn1; /* Estimate next attribute. */ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); if (!attr) { /* Insert new attribute segment. */ goto ins_ext; } /* Try to update existed attribute segment. */ alloc = bytes_to_cluster(sbi, le64_to_cpu(attr_b->nres.alloc_size)); evcn = le64_to_cpu(attr->nres.evcn); if (end < next_svcn) end = next_svcn; while (end > evcn) { /* Remove segment [svcn : evcn). */ mi_remove_attr(NULL, mi, attr); if (!al_remove_le(ni, le)) { err = -EINVAL; goto out; } if (evcn + 1 >= alloc) { /* Last attribute segment. */ evcn1 = evcn + 1; goto ins_ext; } if (ni_load_mi(ni, le, &mi)) { attr = NULL; goto out; } attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); } if (end < svcn) end = svcn; err = attr_load_runs(attr, ni, run, &end); if (err) goto out; evcn1 = evcn + 1; attr->nres.svcn = cpu_to_le64(next_svcn); err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); if (err) goto out; le->vcn = cpu_to_le64(next_svcn); ni->attr_list.dirty = true; mi->dirty = true; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; ins_ext: if (evcn1 > next_svcn) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, attr_b->flags, &attr, &mi, NULL); if (err) goto out; } ok: run_truncate_around(run, vcn); out: if (err && step > 1) { /* Too complex to restore. */ _ntfs_bad_inode(&ni->vfs_inode); } up_write(&ni->file.run_lock); ni_unlock(ni); return err; undo1: /* Undo step1. */ attr_b->nres.total_size = cpu_to_le64(total_size0); inode_set_bytes(&ni->vfs_inode, total_size0); if (run_deallocate_ex(sbi, run, vcn, alen, NULL, false) || !run_add_entry(run, vcn, SPARSE_LCN, alen, false) || mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn)) { _ntfs_bad_inode(&ni->vfs_inode); } goto out; } int attr_data_read_resident(struct ntfs_inode *ni, struct folio *folio) { u64 vbo; struct ATTRIB *attr; u32 data_size; size_t len; attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, NULL); if (!attr) return -EINVAL; if (attr->non_res) return E_NTFS_NONRESIDENT; vbo = folio->index << PAGE_SHIFT; data_size = le32_to_cpu(attr->res.data_size); if (vbo > data_size) len = 0; else len = min(data_size - vbo, folio_size(folio)); folio_fill_tail(folio, 0, resident_data(attr) + vbo, len); folio_mark_uptodate(folio); return 0; } int attr_data_write_resident(struct ntfs_inode *ni, struct folio *folio) { u64 vbo; struct mft_inode *mi; struct ATTRIB *attr; u32 data_size; attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) return -EINVAL; if (attr->non_res) { /* Return special error code to check this case. */ return E_NTFS_NONRESIDENT; } vbo = folio->index << PAGE_SHIFT; data_size = le32_to_cpu(attr->res.data_size); if (vbo < data_size) { char *data = resident_data(attr); size_t len = min(data_size - vbo, folio_size(folio)); memcpy_from_folio(data + vbo, folio, 0, len); mi->dirty = true; } ni->i_valid = data_size; return 0; } /* * attr_load_runs_vcn - Load runs with VCN. */ int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, CLST vcn) { struct ATTRIB *attr; int err; CLST svcn, evcn; u16 ro; if (!ni) { /* Is record corrupted? */ return -ENOENT; } attr = ni_find_attr(ni, NULL, NULL, type, name, name_len, &vcn, NULL); if (!attr) { /* Is record corrupted? */ return -ENOENT; } svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); if (evcn < vcn || vcn < svcn) { /* Is record corrupted? */ return -EINVAL; } ro = le16_to_cpu(attr->nres.run_off); if (ro > le32_to_cpu(attr->size)) return -EINVAL; err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, svcn, Add2Ptr(attr, ro), le32_to_cpu(attr->size) - ro); if (err < 0) return err; return 0; } /* * attr_load_runs_range - Load runs for given range [from to). */ int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 from, u64 to) { struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; CLST vcn; CLST vcn_last = (to - 1) >> cluster_bits; CLST lcn, clen; int err; for (vcn = from >> cluster_bits; vcn <= vcn_last; vcn += clen) { if (!run_lookup_entry(run, vcn, &lcn, &clen, NULL)) { err = attr_load_runs_vcn(ni, type, name, name_len, run, vcn); if (err) return err; clen = 0; /* Next run_lookup_entry(vcn) must be success. */ } } return 0; } #ifdef CONFIG_NTFS3_LZX_XPRESS /* * attr_wof_frame_info * * Read header of Xpress/LZX file to get info about frame. */ int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, struct runs_tree *run, u64 frame, u64 frames, u8 frame_bits, u32 *ondisk_size, u64 *vbo_data) { struct ntfs_sb_info *sbi = ni->mi.sbi; u64 vbo[2], off[2], wof_size; u32 voff; u8 bytes_per_off; char *addr; struct folio *folio; int i, err; __le32 *off32; __le64 *off64; if (ni->vfs_inode.i_size < 0x100000000ull) { /* File starts with array of 32 bit offsets. */ bytes_per_off = sizeof(__le32); vbo[1] = frame << 2; *vbo_data = frames << 2; } else { /* File starts with array of 64 bit offsets. */ bytes_per_off = sizeof(__le64); vbo[1] = frame << 3; *vbo_data = frames << 3; } /* * Read 4/8 bytes at [vbo - 4(8)] == offset where compressed frame starts. * Read 4/8 bytes at [vbo] == offset where compressed frame ends. */ if (!attr->non_res) { if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) { _ntfs_bad_inode(&ni->vfs_inode); return -EINVAL; } addr = resident_data(attr); if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, vbo[1]); off[0] = vbo[1] ? le32_to_cpu(off32[-1]) : 0; off[1] = le32_to_cpu(off32[0]); } else { off64 = Add2Ptr(addr, vbo[1]); off[0] = vbo[1] ? le64_to_cpu(off64[-1]) : 0; off[1] = le64_to_cpu(off64[0]); } *vbo_data += off[0]; *ondisk_size = off[1] - off[0]; return 0; } wof_size = le64_to_cpu(attr->nres.data_size); down_write(&ni->file.run_lock); folio = ni->file.offs_folio; if (!folio) { folio = folio_alloc(GFP_KERNEL, 0); if (!folio) { err = -ENOMEM; goto out; } folio->index = -1; ni->file.offs_folio = folio; } folio_lock(folio); addr = folio_address(folio); if (vbo[1]) { voff = vbo[1] & (PAGE_SIZE - 1); vbo[0] = vbo[1] - bytes_per_off; i = 0; } else { voff = 0; vbo[0] = 0; off[0] = 0; i = 1; } do { pgoff_t index = vbo[i] >> PAGE_SHIFT; if (index != folio->index) { struct page *page = &folio->page; u64 from = vbo[i] & ~(u64)(PAGE_SIZE - 1); u64 to = min(from + PAGE_SIZE, wof_size); err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME), run, from, to); if (err) goto out1; err = ntfs_bio_pages(sbi, run, &page, 1, from, to - from, REQ_OP_READ); if (err) { folio->index = -1; goto out1; } folio->index = index; } if (i) { if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, voff); off[1] = le32_to_cpu(*off32); } else { off64 = Add2Ptr(addr, voff); off[1] = le64_to_cpu(*off64); } } else if (!voff) { if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, PAGE_SIZE - sizeof(u32)); off[0] = le32_to_cpu(*off32); } else { off64 = Add2Ptr(addr, PAGE_SIZE - sizeof(u64)); off[0] = le64_to_cpu(*off64); } } else { /* Two values in one page. */ if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, voff); off[0] = le32_to_cpu(off32[-1]); off[1] = le32_to_cpu(off32[0]); } else { off64 = Add2Ptr(addr, voff); off[0] = le64_to_cpu(off64[-1]); off[1] = le64_to_cpu(off64[0]); } break; } } while (++i < 2); *vbo_data += off[0]; *ondisk_size = off[1] - off[0]; out1: folio_unlock(folio); out: up_write(&ni->file.run_lock); return err; } #endif /* * attr_is_frame_compressed - Used to detect compressed frame. * * attr - base (primary) attribute segment. * run - run to use, usually == &ni->file.run. * Only base segments contains valid 'attr->nres.c_unit' */ int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, CLST frame, CLST *clst_data, struct runs_tree *run) { int err; u32 clst_frame; CLST clen, lcn, vcn, alen, slen, vcn_next; size_t idx; *clst_data = 0; if (!is_attr_compressed(attr)) return 0; if (!attr->non_res) return 0; clst_frame = 1u << attr->nres.c_unit; vcn = frame * clst_frame; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), attr->name_len, run, vcn); if (err) return err; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) return -EINVAL; } if (lcn == SPARSE_LCN) { /* Sparsed frame. */ return 0; } if (clen >= clst_frame) { /* * The frame is not compressed 'cause * it does not contain any sparse clusters. */ *clst_data = clst_frame; return 0; } alen = bytes_to_cluster(ni->mi.sbi, le64_to_cpu(attr->nres.alloc_size)); slen = 0; *clst_data = clen; /* * The frame is compressed if *clst_data + slen >= clst_frame. * Check next fragments. */ while ((vcn += clen) < alen) { vcn_next = vcn; if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || vcn_next != vcn) { err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), attr->name_len, run, vcn_next); if (err) return err; vcn = vcn_next; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) return -EINVAL; } if (lcn == SPARSE_LCN) { slen += clen; } else { if (slen) { /* * Data_clusters + sparse_clusters = * not enough for frame. */ return -EINVAL; } *clst_data += clen; } if (*clst_data + slen >= clst_frame) { if (!slen) { /* * There is no sparsed clusters in this frame * so it is not compressed. */ *clst_data = clst_frame; } else { /* Frame is compressed. */ } break; } } return 0; } /* * attr_allocate_frame - Allocate/free clusters for @frame. * * Assumed: down_write(&ni->file.run_lock); */ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, u64 new_valid) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST svcn, evcn1, next_svcn, len; CLST vcn, end, clst_data; u64 total_size, valid_size, data_size; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!is_attr_ext(attr_b)) return -EINVAL; vcn = frame << NTFS_LZNT_CUNIT; total_size = le64_to_cpu(attr_b->nres.total_size); svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; data_size = le64_to_cpu(attr_b->nres.data_size); if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto out; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; err = attr_is_frame_compressed(ni, attr_b, frame, &clst_data, run); if (err) goto out; total_size -= (u64)clst_data << sbi->cluster_bits; len = bytes_to_cluster(sbi, compr_size); if (len == clst_data) goto out; if (len < clst_data) { err = run_deallocate_ex(sbi, run, vcn + len, clst_data - len, NULL, true); if (err) goto out; if (!run_add_entry(run, vcn + len, SPARSE_LCN, clst_data - len, false)) { err = -ENOMEM; goto out; } end = vcn + clst_data; /* Run contains updated range [vcn + len : end). */ } else { CLST alen, hint = 0; /* Get the last LCN to allocate from. */ if (vcn + clst_data && !run_lookup_entry(run, vcn + clst_data - 1, &hint, NULL, NULL)) { hint = -1; } err = attr_allocate_clusters(sbi, run, vcn + clst_data, hint + 1, len - clst_data, NULL, ALLOCATE_DEF, &alen, 0, NULL, NULL); if (err) goto out; end = vcn + len; /* Run contains updated range [vcn + clst_data : end). */ } total_size += (u64)len << sbi->cluster_bits; repack: err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); if (err) goto out; attr_b->nres.total_size = cpu_to_le64(total_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mi_b->dirty = true; mark_inode_dirty(&ni->vfs_inode); /* Stored [vcn : next_svcn) from [vcn : end). */ next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (end <= evcn1) { if (next_svcn == evcn1) { /* Normal way. Update attribute and exit. */ goto ok; } /* Add new segment [next_svcn : evcn1 - next_svcn). */ if (!ni->attr_list.size) { err = ni_create_attr_list(ni); if (err) goto out; /* Layout of records is changed. */ le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } attr = attr_b; le = le_b; mi = mi_b; goto repack; } } svcn = evcn1; /* Estimate next attribute. */ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); if (attr) { CLST alloc = bytes_to_cluster( sbi, le64_to_cpu(attr_b->nres.alloc_size)); CLST evcn = le64_to_cpu(attr->nres.evcn); if (end < next_svcn) end = next_svcn; while (end > evcn) { /* Remove segment [svcn : evcn). */ mi_remove_attr(NULL, mi, attr); if (!al_remove_le(ni, le)) { err = -EINVAL; goto out; } if (evcn + 1 >= alloc) { /* Last attribute segment. */ evcn1 = evcn + 1; goto ins_ext; } if (ni_load_mi(ni, le, &mi)) { attr = NULL; goto out; } attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); } if (end < svcn) end = svcn; err = attr_load_runs(attr, ni, run, &end); if (err) goto out; evcn1 = evcn + 1; attr->nres.svcn = cpu_to_le64(next_svcn); err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); if (err) goto out; le->vcn = cpu_to_le64(next_svcn); ni->attr_list.dirty = true; mi->dirty = true; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; } ins_ext: if (evcn1 > next_svcn) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, attr_b->flags, &attr, &mi, NULL); if (err) goto out; } ok: run_truncate_around(run, vcn); out: if (attr_b) { if (new_valid > data_size) new_valid = data_size; valid_size = le64_to_cpu(attr_b->nres.valid_size); if (new_valid != valid_size) { attr_b->nres.valid_size = cpu_to_le64(valid_size); mi_b->dirty = true; } } return err; } /* * attr_collapse_range - Collapse range in file. */ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST svcn, evcn1, len, dealloc, alen; CLST vcn, end; u64 valid_size, data_size, alloc_size, total_size; u32 mask; __le16 a_flags; if (!bytes) return 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!attr_b->non_res) { /* Attribute is resident. Nothing to do? */ return 0; } data_size = le64_to_cpu(attr_b->nres.data_size); alloc_size = le64_to_cpu(attr_b->nres.alloc_size); a_flags = attr_b->flags; if (is_attr_ext(attr_b)) { total_size = le64_to_cpu(attr_b->nres.total_size); mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; } else { total_size = alloc_size; mask = sbi->cluster_mask; } if ((vbo & mask) || (bytes & mask)) { /* Allow to collapse only cluster aligned ranges. */ return -EINVAL; } if (vbo > data_size) return -EINVAL; down_write(&ni->file.run_lock); if (vbo + bytes >= data_size) { u64 new_valid = min(ni->i_valid, vbo); /* Simple truncate file at 'vbo'. */ truncate_setsize(&ni->vfs_inode, vbo); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, vbo, &new_valid, true, NULL); if (!err && new_valid < ni->i_valid) ni->i_valid = new_valid; goto out; } /* * Enumerate all attribute segments and collapse. */ alen = alloc_size >> sbi->cluster_bits; vcn = vbo >> sbi->cluster_bits; len = bytes >> sbi->cluster_bits; end = vcn + len; dealloc = 0; svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto out; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } for (;;) { if (svcn >= end) { /* Shift VCN- */ attr->nres.svcn = cpu_to_le64(svcn - len); attr->nres.evcn = cpu_to_le64(evcn1 - 1 - len); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } mi->dirty = true; } else if (svcn < vcn || end < evcn1) { CLST vcn1, eat, next_svcn; /* Collapse a part of this attribute segment. */ err = attr_load_runs(attr, ni, run, &svcn); if (err) goto out; vcn1 = max(vcn, svcn); eat = min(end, evcn1) - vcn1; err = run_deallocate_ex(sbi, run, vcn1, eat, &dealloc, true); if (err) goto out; if (!run_collapse_range(run, vcn1, eat)) { err = -ENOMEM; goto out; } if (svcn >= vcn) { /* Shift VCN */ attr->nres.svcn = cpu_to_le64(vcn); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } } err = mi_pack_runs(mi, attr, run, evcn1 - svcn - eat); if (err) goto out; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (next_svcn + eat < evcn1) { err = ni_insert_nonresident( ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - eat - next_svcn, a_flags, &attr, &mi, &le); if (err) goto out; /* Layout of records maybe changed. */ attr_b = NULL; } /* Free all allocated memory. */ run_truncate(run, 0); } else { u16 le_sz; u16 roff = le16_to_cpu(attr->nres.run_off); if (roff > le32_to_cpu(attr->size)) { err = -EINVAL; goto out; } run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn1 - 1, svcn, Add2Ptr(attr, roff), le32_to_cpu(attr->size) - roff); /* Delete this attribute segment. */ mi_remove_attr(NULL, mi, attr); if (!le) break; le_sz = le16_to_cpu(le->size); if (!al_remove_le(ni, le)) { err = -EINVAL; goto out; } if (evcn1 >= alen) break; if (!svcn) { /* Load next record that contains this attribute. */ if (ni_load_mi(ni, le, &mi)) { err = -EINVAL; goto out; } /* Look for required attribute. */ attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; } goto next_attr; } le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); } if (evcn1 >= alen) break; attr = ni_enum_attr_ex(ni, attr, &le, &mi); if (!attr) { err = -EINVAL; goto out; } next_attr: svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } if (!attr_b) { le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } } data_size -= bytes; valid_size = ni->i_valid; if (vbo + bytes <= valid_size) valid_size -= bytes; else if (vbo < valid_size) valid_size = vbo; attr_b->nres.alloc_size = cpu_to_le64(alloc_size - bytes); attr_b->nres.data_size = cpu_to_le64(data_size); attr_b->nres.valid_size = cpu_to_le64(min(valid_size, data_size)); total_size -= (u64)dealloc << sbi->cluster_bits; if (is_attr_ext(attr_b)) attr_b->nres.total_size = cpu_to_le64(total_size); mi_b->dirty = true; /* Update inode size. */ ni->i_valid = valid_size; i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); out: up_write(&ni->file.run_lock); if (err) _ntfs_bad_inode(&ni->vfs_inode); return err; } /* * attr_punch_hole * * Not for normal files. */ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST svcn, evcn1, vcn, len, end, alen, hole, next_svcn; u64 total_size, alloc_size; u32 mask; __le16 a_flags; struct runs_tree run2; if (!bytes) return 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!attr_b->non_res) { u32 data_size = le32_to_cpu(attr_b->res.data_size); u32 from, to; if (vbo > data_size) return 0; from = vbo; to = min_t(u64, vbo + bytes, data_size); memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); return 0; } if (!is_attr_ext(attr_b)) return -EOPNOTSUPP; alloc_size = le64_to_cpu(attr_b->nres.alloc_size); total_size = le64_to_cpu(attr_b->nres.total_size); if (vbo >= alloc_size) { /* NOTE: It is allowed. */ return 0; } mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; bytes += vbo; if (bytes > alloc_size) bytes = alloc_size; bytes -= vbo; if ((vbo & mask) || (bytes & mask)) { /* We have to zero a range(s). */ if (frame_size == NULL) { /* Caller insists range is aligned. */ return -EINVAL; } *frame_size = mask + 1; return E_NTFS_NOTALIGNED; } down_write(&ni->file.run_lock); run_init(&run2); run_truncate(run, 0); /* * Enumerate all attribute segments and punch hole where necessary. */ alen = alloc_size >> sbi->cluster_bits; vcn = vbo >> sbi->cluster_bits; len = bytes >> sbi->cluster_bits; end = vcn + len; hole = 0; svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; a_flags = attr_b->flags; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } while (svcn < end) { CLST vcn1, zero, hole2 = hole; err = attr_load_runs(attr, ni, run, &svcn); if (err) goto done; vcn1 = max(vcn, svcn); zero = min(end, evcn1) - vcn1; /* * Check range [vcn1 + zero). * Calculate how many clusters there are. * Don't do any destructive actions. */ err = run_deallocate_ex(NULL, run, vcn1, zero, &hole2, false); if (err) goto done; /* Check if required range is already hole. */ if (hole2 == hole) goto next_attr; /* Make a clone of run to undo. */ err = run_clone(run, &run2); if (err) goto done; /* Make a hole range (sparse) [vcn1 + zero). */ if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, false)) { err = -ENOMEM; goto done; } /* Update run in attribute segment. */ err = mi_pack_runs(mi, attr, run, evcn1 - svcn); if (err) goto done; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (next_svcn < evcn1) { /* Insert new attribute segment. */ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, a_flags, &attr, &mi, &le); if (err) goto undo_punch; /* Layout of records maybe changed. */ attr_b = NULL; } /* Real deallocate. Should not fail. */ run_deallocate_ex(sbi, &run2, vcn1, zero, &hole, true); next_attr: /* Free all allocated memory. */ run_truncate(run, 0); if (evcn1 >= alen) break; /* Get next attribute segment. */ attr = ni_enum_attr_ex(ni, attr, &le, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } done: if (!hole) goto out; if (!attr_b) { attr_b = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } } total_size -= (u64)hole << sbi->cluster_bits; attr_b->nres.total_size = cpu_to_le64(total_size); mi_b->dirty = true; /* Update inode size. */ inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); out: run_close(&run2); up_write(&ni->file.run_lock); return err; bad_inode: _ntfs_bad_inode(&ni->vfs_inode); goto out; undo_punch: /* * Restore packed runs. * 'mi_pack_runs' should not fail, cause we restore original. */ if (mi_pack_runs(mi, attr, &run2, evcn1 - svcn)) goto bad_inode; goto done; } /* * attr_insert_range - Insert range (hole) in file. * Not for normal files. */ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST vcn, svcn, evcn1, len, next_svcn; u64 data_size, alloc_size; u32 mask; __le16 a_flags; if (!bytes) return 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!is_attr_ext(attr_b)) { /* It was checked above. See fallocate. */ return -EOPNOTSUPP; } if (!attr_b->non_res) { data_size = le32_to_cpu(attr_b->res.data_size); alloc_size = data_size; mask = sbi->cluster_mask; /* cluster_size - 1 */ } else { data_size = le64_to_cpu(attr_b->nres.data_size); alloc_size = le64_to_cpu(attr_b->nres.alloc_size); mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; } if (vbo >= data_size) { /* * Insert range after the file size is not allowed. * If the offset is equal to or greater than the end of * file, an error is returned. For such operations (i.e., inserting * a hole at the end of file), ftruncate(2) should be used. */ return -EINVAL; } if ((vbo & mask) || (bytes & mask)) { /* Allow to insert only frame aligned ranges. */ return -EINVAL; } /* * valid_size <= data_size <= alloc_size * Check alloc_size for maximum possible. */ if (bytes > sbi->maxbytes_sparse - alloc_size) return -EFBIG; vcn = vbo >> sbi->cluster_bits; len = bytes >> sbi->cluster_bits; down_write(&ni->file.run_lock); if (!attr_b->non_res) { err = attr_set_size(ni, ATTR_DATA, NULL, 0, run, data_size + bytes, NULL, false, NULL); le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } if (err) goto out; if (!attr_b->non_res) { /* Still resident. */ char *data = Add2Ptr(attr_b, le16_to_cpu(attr_b->res.data_off)); memmove(data + bytes, data, bytes); memset(data, 0, bytes); goto done; } /* Resident files becomes nonresident. */ data_size = le64_to_cpu(attr_b->nres.data_size); alloc_size = le64_to_cpu(attr_b->nres.alloc_size); } /* * Enumerate all attribute segments and shift start vcn. */ a_flags = attr_b->flags; svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } run_truncate(run, 0); /* clear cached values. */ err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; if (!run_insert_range(run, vcn, len)) { err = -ENOMEM; goto out; } /* Try to pack in current record as much as possible. */ err = mi_pack_runs(mi, attr, run, evcn1 + len - svcn); if (err) goto out; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && attr->type == ATTR_DATA && !attr->name_len) { le64_add_cpu(&attr->nres.svcn, len); le64_add_cpu(&attr->nres.evcn, len); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } mi->dirty = true; } if (next_svcn < evcn1 + len) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 + len - next_svcn, a_flags, NULL, NULL, NULL); le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } if (err) { /* ni_insert_nonresident failed. Try to undo. */ goto undo_insert_range; } } /* * Update primary attribute segment. */ if (vbo <= ni->i_valid) ni->i_valid += bytes; attr_b->nres.data_size = cpu_to_le64(data_size + bytes); attr_b->nres.alloc_size = cpu_to_le64(alloc_size + bytes); /* ni->valid may be not equal valid_size (temporary). */ if (ni->i_valid > data_size + bytes) attr_b->nres.valid_size = attr_b->nres.data_size; else attr_b->nres.valid_size = cpu_to_le64(ni->i_valid); mi_b->dirty = true; done: i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); out: run_truncate(run, 0); /* clear cached values. */ up_write(&ni->file.run_lock); return err; bad_inode: _ntfs_bad_inode(&ni->vfs_inode); goto out; undo_insert_range: svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } if (attr_load_runs(attr, ni, run, NULL)) goto bad_inode; if (!run_collapse_range(run, vcn, len)) goto bad_inode; if (mi_pack_runs(mi, attr, run, evcn1 + len - svcn)) goto bad_inode; while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && attr->type == ATTR_DATA && !attr->name_len) { le64_sub_cpu(&attr->nres.svcn, len); le64_sub_cpu(&attr->nres.evcn, len); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } mi->dirty = true; } goto out; } /* * attr_force_nonresident * * Convert default data attribute into non resident form. */ int attr_force_nonresident(struct ntfs_inode *ni) { int err; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le = NULL; struct mft_inode *mi; attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) { _ntfs_bad_inode(&ni->vfs_inode); return -ENOENT; } if (attr->non_res) { /* Already non resident. */ return 0; } down_write(&ni->file.run_lock); err = attr_make_nonresident(ni, attr, le, mi, le32_to_cpu(attr->res.data_size), &ni->file.run, &attr, NULL); up_write(&ni->file.run_lock); return err; }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NVRAM_H #define _LINUX_NVRAM_H #include <linux/errno.h> #include <uapi/linux/nvram.h> #ifdef CONFIG_PPC #include <asm/machdep.h> #endif /** * struct nvram_ops - NVRAM functionality made available to drivers * @read: validate checksum (if any) then load a range of bytes from NVRAM * @write: store a range of bytes to NVRAM then update checksum (if any) * @read_byte: load a single byte from NVRAM * @write_byte: store a single byte to NVRAM * @get_size: return the fixed number of bytes in the NVRAM * * Architectures which provide an nvram ops struct need not implement all * of these methods. If the NVRAM hardware can be accessed only one byte * at a time then it may be sufficient to provide .read_byte and .write_byte. * If the NVRAM has a checksum (and it is to be checked) the .read and * .write methods can be used to implement that efficiently. * * Portable drivers may use the wrapper functions defined here. * The nvram_read() and nvram_write() functions call the .read and .write * methods when available and fall back on the .read_byte and .write_byte * methods otherwise. */ struct nvram_ops { ssize_t (*get_size)(void); unsigned char (*read_byte)(int); void (*write_byte)(unsigned char, int); ssize_t (*read)(char *, size_t, loff_t *); ssize_t (*write)(char *, size_t, loff_t *); #if defined(CONFIG_X86) || defined(CONFIG_M68K) long (*initialize)(void); long (*set_checksum)(void); #endif }; extern const struct nvram_ops arch_nvram_ops; static inline ssize_t nvram_get_size(void) { #ifdef CONFIG_PPC if (ppc_md.nvram_size) return ppc_md.nvram_size(); #else if (arch_nvram_ops.get_size) return arch_nvram_ops.get_size(); #endif return -ENODEV; } static inline unsigned char nvram_read_byte(int addr) { #ifdef CONFIG_PPC if (ppc_md.nvram_read_val) return ppc_md.nvram_read_val(addr); #else if (arch_nvram_ops.read_byte) return arch_nvram_ops.read_byte(addr); #endif return 0xFF; } static inline void nvram_write_byte(unsigned char val, int addr) { #ifdef CONFIG_PPC if (ppc_md.nvram_write_val) ppc_md.nvram_write_val(addr, val); #else if (arch_nvram_ops.write_byte) arch_nvram_ops.write_byte(val, addr); #endif } static inline ssize_t nvram_read_bytes(char *buf, size_t count, loff_t *ppos) { ssize_t nvram_size = nvram_get_size(); loff_t i; char *p = buf; if (nvram_size < 0) return nvram_size; for (i = *ppos; count > 0 && i < nvram_size; ++i, ++p, --count) *p = nvram_read_byte(i); *ppos = i; return p - buf; } static inline ssize_t nvram_write_bytes(char *buf, size_t count, loff_t *ppos) { ssize_t nvram_size = nvram_get_size(); loff_t i; char *p = buf; if (nvram_size < 0) return nvram_size; for (i = *ppos; count > 0 && i < nvram_size; ++i, ++p, --count) nvram_write_byte(*p, i); *ppos = i; return p - buf; } static inline ssize_t nvram_read(char *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_PPC if (ppc_md.nvram_read) return ppc_md.nvram_read(buf, count, ppos); #else if (arch_nvram_ops.read) return arch_nvram_ops.read(buf, count, ppos); #endif return nvram_read_bytes(buf, count, ppos); } static inline ssize_t nvram_write(char *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_PPC if (ppc_md.nvram_write) return ppc_md.nvram_write(buf, count, ppos); #else if (arch_nvram_ops.write) return arch_nvram_ops.write(buf, count, ppos); #endif return nvram_write_bytes(buf, count, ppos); } #endif /* _LINUX_NVRAM_H */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */
33 14 31 1 19 12 31 31 9 2 9 13 5 9 9 2 9 11 8 3 1 1 33 20 6 8 2 11 13 1 1 3 3 1 26 25 1 1 1 7 7 7 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2021-2022, Alibaba Cloud */ #include <linux/security.h> #include <linux/xxhash.h> #include "xattr.h" struct erofs_xattr_iter { struct super_block *sb; struct erofs_buf buf; erofs_off_t pos; void *kaddr; char *buffer; int buffer_size, buffer_ofs; /* getxattr */ int index, infix_len; struct qstr name; /* listxattr */ struct dentry *dentry; }; static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct erofs_xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; struct super_block *sb = inode->i_sb; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) { /* * paired with smp_mb() at the end of the function to ensure * fields will only be observed after the bit is set. */ smp_mb(); return 0; } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) return -ERESTARTSYS; /* someone has initialized xattrs for us? */ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) goto out_unlock; /* * bypass all xattr operations if ->xattr_isize is not greater than * sizeof(struct erofs_xattr_ibody_header), in detail: * 1) it is not enough to contain erofs_xattr_ibody_header then * ->xattr_isize should be 0 (it means no xattr); * 2) it is just to contain erofs_xattr_ibody_header, which is on-disk * undefined right now (maybe use later with some new sb feature). */ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { erofs_err(sb, "xattr_isize %d of nid %llu is not supported yet", vi->xattr_isize, vi->nid); ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { if (vi->xattr_isize) { erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); DBG_BUGON(1); ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ } ret = -ENODATA; goto out_unlock; } it.buf = __EROFS_BUF_INITIALIZER; ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode)); if (ret) goto out_unlock; it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } ih = it.kaddr; vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); if (!vi->xattr_shared_xattrs) { erofs_put_metabuf(&it.buf); ret = -ENOMEM; goto out_unlock; } /* let's skip ibody header */ it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; ret = PTR_ERR(it.kaddr); goto out_unlock; } vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr); it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); /* paired with smp_mb() at the beginning of the function. */ smp_mb(); set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags); return ret; } static bool erofs_xattr_user_list(struct dentry *dentry) { return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); } static bool erofs_xattr_trusted_list(struct dentry *dentry) { return capable(CAP_SYS_ADMIN); } static int erofs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { if (handler->flags == EROFS_XATTR_INDEX_USER && !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER)) return -EOPNOTSUPP; return erofs_getxattr(inode, handler->flags, name, buffer, size); } const struct xattr_handler erofs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = EROFS_XATTR_INDEX_USER, .list = erofs_xattr_user_list, .get = erofs_xattr_generic_get, }; const struct xattr_handler erofs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = EROFS_XATTR_INDEX_TRUSTED, .list = erofs_xattr_trusted_list, .get = erofs_xattr_generic_get, }; #ifdef CONFIG_EROFS_FS_SECURITY const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = EROFS_XATTR_INDEX_SECURITY, .get = erofs_xattr_generic_get, }; #endif const struct xattr_handler * const erofs_xattr_handlers[] = { &erofs_xattr_user_handler, &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY &erofs_xattr_security_handler, #endif NULL, }; static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, unsigned int len) { unsigned int slice, processed; struct super_block *sb = it->sb; void *src; for (processed = 0; processed < len; processed += slice) { it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); src = it->kaddr; slice = min_t(unsigned int, sb->s_blocksize - erofs_blkoff(sb, it->pos), len - processed); memcpy(it->buffer + it->buffer_ofs, src, slice); it->buffer_ofs += slice; it->pos += slice; } return 0; } static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) { struct erofs_xattr_entry entry; unsigned int base_index, name_total, prefix_len, infix_len = 0; const char *prefix, *infix = NULL; int err; /* 1. handle xattr entry */ entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); base_index = entry.e_name_index; if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { struct erofs_sb_info *sbi = EROFS_SB(it->sb); struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) return 0; infix = pf->prefix->infix; infix_len = pf->infix_len; base_index = pf->prefix->base_index; } prefix = erofs_xattr_prefix(base_index, it->dentry); if (!prefix) return 0; prefix_len = strlen(prefix); name_total = prefix_len + infix_len + entry.e_name_len + 1; if (!it->buffer) { it->buffer_ofs += name_total; return 0; } if (it->buffer_ofs + name_total > it->buffer_size) return -ERANGE; memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); it->buffer_ofs += prefix_len + infix_len; /* 2. handle xattr name */ err = erofs_xattr_copy_to_buffer(it, entry.e_name_len); if (err) return err; it->buffer[it->buffer_ofs++] = '\0'; return 0; } static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) { struct super_block *sb = it->sb; struct erofs_xattr_entry entry; unsigned int slice, processed, value_sz; /* 1. handle xattr entry */ entry = *(struct erofs_xattr_entry *)it->kaddr; it->pos += sizeof(struct erofs_xattr_entry); value_sz = le16_to_cpu(entry.e_value_size); /* should also match the infix for long name prefixes */ if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) return -ENODATA; if (it->index != pf->prefix->base_index || it->name.len != entry.e_name_len + pf->infix_len) return -ENODATA; if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) return -ENODATA; it->infix_len = pf->infix_len; } else { if (it->index != entry.e_name_index || it->name.len != entry.e_name_len) return -ENODATA; it->infix_len = 0; } /* 2. handle xattr name */ for (processed = 0; processed < entry.e_name_len; processed += slice) { it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); slice = min_t(unsigned int, sb->s_blocksize - erofs_blkoff(sb, it->pos), entry.e_name_len - processed); if (memcmp(it->name.name + it->infix_len + processed, it->kaddr, slice)) return -ENODATA; it->pos += slice; } /* 3. handle xattr value */ if (!it->buffer) { it->buffer_ofs = value_sz; return 0; } if (it->buffer_size < value_sz) return -ERANGE; return erofs_xattr_copy_to_buffer(it, value_sz); } static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, struct inode *inode, bool getxattr) { struct erofs_inode *const vi = EROFS_I(inode); unsigned int xattr_header_sz, remaining, entry_sz; erofs_off_t next_pos; int ret; xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + sizeof(u32) * vi->xattr_shared_count; if (xattr_header_sz >= vi->xattr_isize) { DBG_BUGON(xattr_header_sz > vi->xattr_isize); return -ENODATA; } ret = erofs_init_metabuf(&it->buf, it->sb, erofs_inode_in_metabox(inode)); if (ret) return ret; remaining = vi->xattr_isize - xattr_header_sz; it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; while (remaining) { it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); entry_sz = erofs_xattr_entry_size(it->kaddr); /* xattr on-disk corruption: xattr entry beyond xattr_isize */ if (remaining < entry_sz) { DBG_BUGON(1); return -EFSCORRUPTED; } remaining -= entry_sz; next_pos = it->pos + entry_sz; if (getxattr) ret = erofs_getxattr_foreach(it); else ret = erofs_listxattr_foreach(it); if ((getxattr && ret != -ENODATA) || (!getxattr && ret)) break; it->pos = next_pos; } return ret; } static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, struct inode *inode, bool getxattr) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = it->sb; struct erofs_sb_info *sbi = EROFS_SB(sb); unsigned int i = 0; int ret; ret = erofs_init_metabuf(&it->buf, sb, erofs_sb_has_shared_ea_in_metabox(sbi)); if (ret) return ret; while (i < vi->xattr_shared_count) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + vi->xattr_shared_xattrs[i++] * sizeof(__le32); it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); if (getxattr) ret = erofs_getxattr_foreach(it); else ret = erofs_listxattr_foreach(it); if ((getxattr && ret != -ENODATA) || (!getxattr && ret)) break; } return i ? ret : -ENODATA; } int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { int ret; unsigned int hashbit; struct erofs_xattr_iter it; struct erofs_inode *vi = EROFS_I(inode); struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!name) return -EINVAL; ret = erofs_init_inode_xattrs(inode); if (ret) return ret; /* reserved flag is non-zero if there's any change of on-disk format */ if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) { hashbit = xxh32(name, strlen(name), EROFS_XATTR_FILTER_SEED + index); hashbit &= EROFS_XATTR_FILTER_BITS - 1; if (vi->xattr_name_filter & (1U << hashbit)) return -ENODATA; } it.index = index; it.name = QSTR(name); if (it.name.len > EROFS_NAME_LEN) return -ERANGE; it.sb = inode->i_sb; it.buf = __EROFS_BUF_INITIALIZER; it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; ret = erofs_xattr_iter_inline(&it, inode, true); if (ret == -ENODATA) ret = erofs_xattr_iter_shared(&it, inode, true); erofs_put_metabuf(&it.buf); return ret ? ret : it.buffer_ofs; } ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret; struct erofs_xattr_iter it; struct inode *inode = d_inode(dentry); ret = erofs_init_inode_xattrs(inode); if (ret == -ENODATA) return 0; if (ret) return ret; it.sb = dentry->d_sb; it.buf = __EROFS_BUF_INITIALIZER; it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; ret = erofs_xattr_iter_inline(&it, inode, false); if (!ret || ret == -ENODATA) ret = erofs_xattr_iter_shared(&it, inode, false); if (ret == -ENODATA) ret = 0; erofs_put_metabuf(&it.buf); return ret ? ret : it.buffer_ofs; } void erofs_xattr_prefixes_cleanup(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); int i; if (sbi->xattr_prefixes) { for (i = 0; i < sbi->xattr_prefix_count; i++) kfree(sbi->xattr_prefixes[i].prefix); kfree(sbi->xattr_prefixes); sbi->xattr_prefixes = NULL; } } int erofs_xattr_prefixes_init(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; struct erofs_xattr_prefix_item *pfs; int ret = 0, i, len; bool plain = erofs_sb_has_plain_xattr_pfx(sbi); if (!sbi->xattr_prefix_count) return 0; pfs = kcalloc(sbi->xattr_prefix_count, sizeof(*pfs), GFP_KERNEL); if (!pfs) return -ENOMEM; if (!plain) { if (erofs_sb_has_metabox(sbi)) (void)erofs_init_metabuf(&buf, sb, true); else if (sbi->packed_inode) buf.mapping = sbi->packed_inode->i_mapping; else plain = true; } if (plain) (void)erofs_init_metabuf(&buf, sb, false); for (i = 0; i < sbi->xattr_prefix_count; i++) { void *ptr = erofs_read_metadata(sb, &buf, &pos, &len); if (IS_ERR(ptr)) { ret = PTR_ERR(ptr); break; } else if (len < sizeof(*pfs->prefix) || len > EROFS_NAME_LEN + sizeof(*pfs->prefix)) { kfree(ptr); ret = -EFSCORRUPTED; break; } pfs[i].prefix = ptr; pfs[i].infix_len = len - sizeof(struct erofs_xattr_long_prefix); } erofs_put_metabuf(&buf); sbi->xattr_prefixes = pfs; if (ret) erofs_xattr_prefixes_cleanup(sb); return ret; } #ifdef CONFIG_EROFS_FS_POSIX_ACL struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; int prefix, rc; char *value = NULL; if (rcu) return ERR_PTR(-ECHILD); switch (type) { case ACL_TYPE_ACCESS: prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); } rc = erofs_getxattr(inode, prefix, "", NULL, 0); if (rc > 0) { value = kmalloc(rc, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); rc = erofs_getxattr(inode, prefix, "", value, rc); } if (rc == -ENODATA) acl = NULL; else if (rc < 0) acl = ERR_PTR(rc); else acl = posix_acl_from_xattr(&init_user_ns, value, rc); kfree(value); return acl; } #endif
5848 592 987 850 146 988 147 457 5789 5793 5792 5807 5512 5511 3 275 275 273 578 2392 2031 578 148 1 2391 771 2180 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 /* SPDX-License-Identifier: GPL-2.0 */ /* * Percpu refcounts: * (C) 2012 Google, Inc. * Author: Kent Overstreet <koverstreet@google.com> * * This implements a refcount with similar semantics to atomic_t - atomic_inc(), * atomic_dec_and_test() - but percpu. * * There's one important difference between percpu refs and normal atomic_t * refcounts; you have to keep track of your initial refcount, and then when you * start shutting down you call percpu_ref_kill() _before_ dropping the initial * refcount. * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() * puts the ref back in single atomic_t mode, collecting the per cpu refs and * issuing the appropriate barriers, and then marks the ref as shutting down so * that percpu_ref_put() will check for the ref hitting 0. After it returns, * it's safe to drop the initial ref. * * USAGE: * * See fs/aio.c for some example usage; it's used there for struct kioctx, which * is created when userspaces calls io_setup(), and destroyed when userspace * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref. * After that, there can't be any new users of the kioctx (from lookup_ioctx()) * and it's then safe to drop the initial ref with percpu_ref_put(). * * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't * imply RCU grace periods of any kind and if a user wants to combine percpu_ref * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped * once - percpu_ref_kill() does this for you, it returns true once and false if * someone else already called it. The aio code uses it this way, but it's not * necessary if the code has some other mechanism to synchronize teardown. * around. */ #ifndef _LINUX_PERCPU_REFCOUNT_H #define _LINUX_PERCPU_REFCOUNT_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/gfp.h> struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ enum { __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, __PERCPU_REF_FLAG_BITS = 2, }; /* @flags for percpu_ref_init() */ enum { /* * Start w/ ref == 1 in atomic mode. Can be switched to percpu * operation using percpu_ref_switch_to_percpu(). If initialized * with this flag, the ref will stay in atomic mode until * percpu_ref_switch_to_percpu() is invoked on it. * Implies ALLOW_REINIT. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, /* * Start dead w/ ref == 0 in atomic mode. Must be revived with * percpu_ref_reinit() before used. Implies INIT_ATOMIC and * ALLOW_REINIT. */ PERCPU_REF_INIT_DEAD = 1 << 1, /* * Allow switching from atomic mode to percpu mode. */ PERCPU_REF_ALLOW_REINIT = 1 << 2, }; struct percpu_ref_data { atomic_long_t count; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; bool allow_reinit:1; struct rcu_head rcu; struct percpu_ref *ref; }; struct percpu_ref { /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ unsigned long percpu_count_ptr; /* * 'percpu_ref' is often embedded into user structure, and only * 'percpu_count_ptr' is required in fast path, move other fields * into 'percpu_ref_data', so we can reduce memory footprint in * fast path. */ struct percpu_ref_data *data; }; int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); bool percpu_ref_is_zero(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref * @ref: percpu_ref to kill * * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * * Switches @ref into atomic mode before gathering up the percpu counters * and dropping the initial ref. * * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { percpu_ref_kill_and_confirm(ref, NULL); } /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional * branches as it can't assume that @ref->percpu_count is not NULL. */ static inline bool __ref_is_percpu(struct percpu_ref *ref, unsigned long __percpu **percpu_countp) { unsigned long percpu_ptr; /* * The value of @ref->percpu_count_ptr is tested for * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then * used as a pointer. If the compiler generates a separate fetch * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. * * The dependency ordering from the READ_ONCE() pairs * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); /* * Theoretically, the following could test just ATOMIC; however, * then we'd have to mask off DEAD separately as DEAD may be * visible without ATOMIC if we race with percpu_ref_kill(). DEAD * implies ATOMIC anyway. Test them together. */ if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } /** * percpu_ref_get_many - increment a percpu refcount * @ref: percpu_ref to get * @nr: number of references to get * * Analogous to atomic_long_add(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_add(*percpu_count, nr); else atomic_long_add(nr, &ref->data->count); rcu_read_unlock(); } /** * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * * Analogous to atomic_long_inc(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get(struct percpu_ref *ref) { percpu_ref_get_many(ref, 1); } /** * percpu_ref_tryget_many - try to increment a percpu refcount * @ref: percpu_ref to try-get * @nr: number of references to get * * Increment a percpu refcount by @nr unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; bool ret; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_add(*percpu_count, nr); ret = true; } else { ret = atomic_long_add_unless(&ref->data->count, nr, 0); } rcu_read_unlock(); return ret; } /** * percpu_ref_tryget - try to increment a percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { return percpu_ref_tryget_many(ref, 1); } /** * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the * caller is responsible for taking RCU. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; bool ret = false; WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(__ref_is_percpu(ref, &percpu_count))) { this_cpu_inc(*percpu_count); ret = true; } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->data->count); } return ret; } /** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * * Completion of percpu_ref_kill() in itself doesn't guarantee that this * function will fail. For such guarantee, percpu_ref_kill_and_confirm() * should be used. After the confirm_kill callback is invoked, it's * guaranteed that no new reference will be given out by * percpu_ref_tryget_live(). * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { bool ret = false; rcu_read_lock(); ret = percpu_ref_tryget_live_rcu(ref); rcu_read_unlock(); return ret; } /** * percpu_ref_put_many - decrement a percpu refcount * @ref: percpu_ref to put * @nr: number of references to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_sub(*percpu_count, nr); else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count))) ref->data->release(ref); rcu_read_unlock(); } /** * percpu_ref_put - decrement a percpu refcount * @ref: percpu_ref to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { percpu_ref_put_many(ref, 1); } /** * percpu_ref_is_dying - test whether a percpu refcount is dying or dead * @ref: percpu_ref to test * * Returns %true if @ref is dying or dead. * * This function is safe to call as long as @ref is between init and exit * and the caller is responsible for synchronizing against state changes. */ static inline bool percpu_ref_is_dying(struct percpu_ref *ref) { return ref->percpu_count_ptr & __PERCPU_REF_DEAD; } #endif
2 1 4 8 2 13 2 15 21 3 10 15 10 5 13 8 8 8 1 7 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/pagemap.h> static int iomap_to_fiemap(struct fiemap_extent_info *fi, const struct iomap *iomap, u32 flags) { switch (iomap->type) { case IOMAP_HOLE: /* skip holes */ return 0; case IOMAP_DELALLOC: flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; break; case IOMAP_MAPPED: break; case IOMAP_UNWRITTEN: flags |= FIEMAP_EXTENT_UNWRITTEN; break; case IOMAP_INLINE: flags |= FIEMAP_EXTENT_DATA_INLINE; break; } if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, iomap->length, flags); } static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) goto advance; ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; if (ret < 0) return ret; if (ret == 1) /* extent array full */ return 0; advance: return iomap_iter_advance_full(iter); } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, u64 start, u64 len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = start, .len = len, .flags = IOMAP_REPORT, }; struct iomap prev = { .type = IOMAP_HOLE, }; int ret; ret = fiemap_prep(inode, fi, start, &iter.len, 0); if (ret) return ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.status = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); if (ret < 0) return ret; } /* inode with no (attribute) mapping will give ENOENT */ if (ret < 0 && ret != -ENOENT) return ret; return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); /* legacy ->bmap interface. 0 is the error return (!) */ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = mapping->host, .pos = (loff_t)bno << mapping->host->i_blkbits, .len = i_blocksize(mapping->host), .flags = IOMAP_REPORT, }; const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT; int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; /* leave iter.status unset to abort loop */ } if (ret) return 0; return bno; } EXPORT_SYMBOL_GPL(iomap_bmap);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FPU_XCR_H #define _ASM_X86_FPU_XCR_H #define XCR_XFEATURE_ENABLED_MASK 0x00000000 #define XCR_XFEATURE_IN_USE_MASK 0x00000001 static __always_inline u64 xgetbv(u32 index) { u32 eax, edx; asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); return eax + ((u64)edx << 32); } static inline void xsetbv(u32 index, u64 value) { u32 eax = value; u32 edx = value >> 32; asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } /* * Return a mask of xfeatures which are currently being tracked * by the processor as being in the initial configuration. * * Callers should check X86_FEATURE_XGETBV1. */ static __always_inline u64 xfeatures_in_use(void) { return xgetbv(XCR_XFEATURE_IN_USE_MASK); } #endif /* _ASM_X86_FPU_XCR_H */
42 39 13 43 37 26 69 43 43 36 23 107 108 108 5 41 41 41 39 31 78 44 108 74 89 91 3 3 1 1 2 44 44 44 41 42 43 5 5 135 1 3 106 132 5 51 1 51 1 45 45 44 44 42 42 42 2 36 37 2 4 34 2 2 25 25 25 7 13 10 7 1 6 14 1 1 69 69 2 2 2 2 2 2 2 2 2 2 2 6 6 2 1 3 1 2 6 1 5 6 4 2 2 49 49 6 45 4 2 5 6 5 38 4 35 18 2 38 23 2 38 7 1 7 1 4 3 2 2 1 2 1 1 1 3 3 1 1 2 3 1 2 3 14 14 49 8 8 2 3 8 8 8 8 1 1 1 95 95 88 95 95 1 1 1 67 67 67 67 67 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2011 Novell Inc. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/rbtree.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/ratelimit.h> #include <linux/overflow.h> #include "overlayfs.h" struct ovl_cache_entry { unsigned int len; unsigned int type; u64 real_ino; u64 ino; struct list_head l_node; struct rb_node node; struct ovl_cache_entry *next_maybe_whiteout; bool is_upper; bool is_whiteout; bool check_xwhiteout; const char *c_name; int c_len; char name[]; }; struct ovl_dir_cache { long refcount; u64 version; struct list_head entries; struct rb_root root; }; struct ovl_readdir_data { struct dir_context ctx; struct dentry *dentry; bool is_lowest; struct rb_root *root; struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; struct unicode_map *map; int count; int err; bool is_upper; bool d_type_supported; bool in_xwhiteouts_dir; }; struct ovl_dir_file { bool is_real; bool is_upper; struct ovl_dir_cache *cache; struct list_head *cursor; struct file *realfile; struct file *upperfile; }; static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) { return rb_entry(n, struct ovl_cache_entry, node); } static int ovl_casefold(struct ovl_readdir_data *rdd, const char *str, int len, char **dst) { const struct qstr qstr = { .name = str, .len = len }; char *cf_name; int cf_len; if (!IS_ENABLED(CONFIG_UNICODE) || !rdd->map || is_dot_dotdot(str, len)) return 0; cf_name = kmalloc(NAME_MAX, GFP_KERNEL); if (!cf_name) { rdd->err = -ENOMEM; return -ENOMEM; } cf_len = utf8_casefold(rdd->map, &qstr, cf_name, NAME_MAX); if (cf_len > 0) *dst = cf_name; else kfree(cf_name); return cf_len; } static bool ovl_cache_entry_find_link(const char *name, int len, struct rb_node ***link, struct rb_node **parent) { bool found = false; struct rb_node **newp = *link; while (!found && *newp) { int cmp; struct ovl_cache_entry *tmp; *parent = *newp; tmp = ovl_cache_entry_from_node(*newp); cmp = strncmp(name, tmp->c_name, len); if (cmp > 0) newp = &tmp->node.rb_right; else if (cmp < 0 || len < tmp->c_len) newp = &tmp->node.rb_left; else found = true; } *link = newp; return found; } static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, const char *name, int len) { struct rb_node *node = root->rb_node; int cmp; while (node) { struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); cmp = strncmp(name, p->c_name, len); if (cmp > 0) node = p->node.rb_right; else if (cmp < 0 || len < p->c_len) node = p->node.rb_left; else return p; } return NULL; } static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, struct ovl_cache_entry *p) { /* Don't care if not doing ovl_iter() */ if (!rdd->dentry) return false; /* Always recalc d_ino when remapping lower inode numbers */ if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb))) return true; /* Always recalc d_ino for parent */ if (strcmp(p->name, "..") == 0) return true; /* If this is lower, then native d_ino will do */ if (!rdd->is_upper) return false; /* * Recalc d_ino for '.' and for all entries if dir is impure (contains * copied up entries) */ if ((p->name[0] == '.' && p->len == 1) || ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) return true; return false; } static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, const char *c_name, int c_len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; p = kmalloc(struct_size(p, name, len + 1), GFP_KERNEL); if (!p) return NULL; memcpy(p->name, name, len); p->name[len] = '\0'; p->len = len; p->type = d_type; p->real_ino = ino; p->ino = ino; /* Defer setting d_ino for upper entry to ovl_iterate() */ if (ovl_calc_d_ino(rdd, p)) p->ino = 0; p->is_upper = rdd->is_upper; p->is_whiteout = false; /* Defer check for overlay.whiteout to ovl_iterate() */ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; if (c_name && c_name != name) { p->c_name = c_name; p->c_len = c_len; } else { p->c_name = p->name; p->c_len = len; } if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p; } return p; } /* Return 0 for found, 1 for added, <0 for error */ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, const char *c_name, int c_len, u64 ino, unsigned int d_type) { struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; if (ovl_cache_entry_find_link(c_name, c_len, &newp, &parent)) return 0; p = ovl_cache_entry_new(rdd, name, len, c_name, c_len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return -ENOMEM; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); return 1; } /* Return 0 for found, 1 for added, <0 for error */ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, const char *c_name, int c_len, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(rdd->root, c_name, c_len); if (p) { list_move_tail(&p->l_node, &rdd->middle); return 0; } else { p = ovl_cache_entry_new(rdd, name, namelen, c_name, c_len, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else list_add_tail(&p->l_node, &rdd->middle); } return rdd->err ?: 1; } static void ovl_cache_entry_free(struct ovl_cache_entry *p) { if (p->c_name != p->name) kfree(p->c_name); kfree(p); } void ovl_cache_free(struct list_head *list) { struct ovl_cache_entry *p; struct ovl_cache_entry *n; list_for_each_entry_safe(p, n, list, l_node) ovl_cache_entry_free(p); INIT_LIST_HEAD(list); } void ovl_dir_cache_free(struct inode *inode) { struct ovl_dir_cache *cache = ovl_dir_cache(inode); if (cache) { ovl_cache_free(&cache->entries); kfree(cache); } } static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { if (ovl_dir_cache(inode) == cache) ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); } } static bool ovl_fill_merge(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); struct ovl_fs *ofs = OVL_FS(rdd->dentry->d_sb); const char *c_name = NULL; char *cf_name = NULL; int c_len = 0, ret; if (ofs->casefold) c_len = ovl_casefold(rdd, name, namelen, &cf_name); if (rdd->err) return false; if (c_len <= 0) { c_name = name; c_len = namelen; } else { c_name = cf_name; } rdd->count++; if (!rdd->is_lowest) ret = ovl_cache_entry_add_rb(rdd, name, namelen, c_name, c_len, ino, d_type); else ret = ovl_fill_lowest(rdd, name, namelen, c_name, c_len, offset, ino, d_type); /* * If ret == 1, that means that c_name is being used as part of struct * ovl_cache_entry and will be freed at ovl_cache_free(). Otherwise, * c_name was found in the rb-tree so we can free it here. */ if (ret != 1 && c_name != name) kfree(c_name); return ret >= 0; } static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err = 0; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); while (rdd->first_maybe_whiteout) { struct ovl_cache_entry *p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; dentry = lookup_one_positive_killable(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); } else if (PTR_ERR(dentry) == -EINTR) { err = -EINTR; break; } } ovl_revert_creds(old_cred); return err; } static inline int ovl_dir_read(const struct path *realpath, struct ovl_readdir_data *rdd) { struct file *realfile; int err; realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE); if (IS_ERR(realfile)) return PTR_ERR(realfile); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { rdd->count = 0; rdd->err = 0; err = iterate_dir(realfile, &rdd->ctx); if (err >= 0) err = rdd->err; } while (!err && rdd->count); if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = ovl_check_whiteouts(realpath, rdd); fput(realfile); return err; } static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; struct inode *inode = file_inode(file); bool is_real; if (cache && ovl_inode_version_get(inode) != cache->version) { ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) return; od->is_real = false; } } static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .ctx.count = INT_MAX, .dentry = dentry, .list = list, .root = root, .is_lowest = false, .map = NULL, }; int idx, next; const struct ovl_layer *layer; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath, &layer); if (ofs->casefold) rdd.map = sb_encoding(realpath.dentry->d_sb); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); if (err) break; } else { /* * Insert lowest layer entries before upper ones, this * allows offsets to be reasonably constant */ list_add(&rdd.middle, rdd.list); rdd.is_lowest = true; err = ovl_dir_read(&realpath, &rdd); list_del(&rdd.middle); } } return err; } static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) { struct list_head *p; loff_t off = 0; list_for_each(p, &od->cache->entries) { if (off >= pos) break; off++; } /* Cursor is safe since the cache is stable */ od->cursor = p; } static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; struct inode *inode = d_inode(dentry); cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; } ovl_set_dir_cache(d_inode(dentry), NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); cache->root = RB_ROOT; res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } /* Map inode number to lower fs unique range */ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, const char *name, int namelen, bool warn) { unsigned int xinoshift = 64 - xinobits; if (unlikely(ino >> xinoshift)) { if (warn) { pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", namelen, name, ino, xinobits); } return ino; } /* * The lowest xinobit is reserved for mapping the non-peresistent inode * numbers range, but this range is only exposed via st_ino, not here. */ return ino | ((u64)fsid) << (xinoshift + 1); } /* * Set d_ino for upper entries if needed. Non-upper entries should always report * the uppermost real inode ino and should not call this function. * * When not all layer are on same fs, report real ino also for upper. * * When all layers are on the same fs, and upper has a reference to * copy up origin, call vfs_getattr() on the overlay entry to make * sure that d_ino will be consistent with st_ino from stat(2). * * Also checks the overlay.whiteout xattr by doing a full lookup which will return * negative in this case. */ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino) { struct dentry *dir = path->dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; int xinobits = ovl_xino_bits(ofs); int err = 0; if (!ovl_same_dev(ofs) && !p->check_xwhiteout) goto out; if (p->name[0] == '.') { if (p->len == 1) { this = dget(dir); goto get; } if (p->len == 2 && p->name[1] == '.') { /* we shall not be moved */ this = dget(dir->d_parent); goto get; } } /* This checks also for xwhiteouts */ this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; goto fail; } goto out; } get: if (!ovl_same_dev(ofs) || !update_ino) goto out; type = ovl_path_type(this); if (OVL_TYPE_ORIGIN(type)) { struct kstat stat; struct path statpath = *path; statpath.dentry = this; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) goto fail; /* * Directory inode is always on overlay st_dev. * Non-dir with ovl_same_dev() could be on pseudo st_dev in case * of xino bits overflow. */ WARN_ON_ONCE(S_ISDIR(stat.mode) && dir->d_sb->s_dev != stat.dev); ino = stat.ino; } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, ovl_layer_lower(this)->fsid, p->name, p->len, ovl_xino_warn(ofs)); } out: p->ino = ino; dput(this); return err; fail: pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } static bool ovl_fill_plain(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; p = ovl_cache_entry_new(rdd, name, namelen, NULL, 0, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; } list_add_tail(&p->l_node, rdd->list); return true; } static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_cache_entry *p, *n; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .ctx.count = INT_MAX, .list = list, .root = root, }; INIT_LIST_HEAD(list); *root = RB_ROOT; ovl_path_upper(path->dentry, &realpath); err = ovl_dir_read(&realpath, &rdd); if (err) return err; list_for_each_entry_safe(p, n, list, l_node) { if (strcmp(p->name, ".") != 0 && strcmp(p->name, "..") != 0) { err = ovl_cache_update(path, p, true); if (err) return err; } if (p->ino == p->real_ino) { list_del(&p->l_node); ovl_cache_entry_free(p); } else { struct rb_node **newp = &root->rb_node; struct rb_node *parent = NULL; if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, &newp, &parent))) return -EIO; rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, root); } } return 0; } static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) { int res; struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ ovl_dir_cache_free(inode); ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); res = ovl_dir_read_impure(path, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } if (list_empty(&cache->entries)) { /* * A good opportunity to get rid of an unneeded "impure" flag. * Removing the "impure" xattr is best effort. */ if (!ovl_want_write(dentry)) { ovl_removexattr(ofs, ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); ovl_drop_write(dentry); } ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } struct ovl_readdir_translate { struct dir_context *orig_ctx; struct ovl_dir_cache *cache; struct dir_context ctx; u64 parent_ino; int fsid; int xinobits; bool xinowarn; }; static bool ovl_fill_real(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_translate *rdt = container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; bool res; if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; } else if (rdt->cache) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); if (p) ino = p->ino; } else if (rdt->xinobits) { ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, name, namelen, rdt->xinowarn); } res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); ctx->count = orig_ctx->count; return res; } static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of * iterating a lower real dir, dir could be copied up and marked * impure. We only want the impure cache if we started iterating * a real upper dir to begin with. */ return od->is_upper && ovl_test_flag(OVL_IMPURE, dir); } static int ovl_iterate_real(struct file *file, struct dir_context *ctx) { int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .ctx.count = ctx->count, .orig_ctx = ctx, .xinobits = ovl_xino_bits(ofs), .xinowarn = ovl_xino_warn(ofs), }; if (rdt.xinobits && lower_layer) rdt.fsid = lower_layer->fsid; if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { struct kstat stat; struct path statpath = file->f_path; statpath.dentry = dir->d_parent; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) return err; WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); rdt.parent_ino = stat.ino; } if (ovl_is_impure_dir(file)) { rdt.cache = ovl_cache_get_impure(&file->f_path); if (IS_ERR(rdt.cache)) return PTR_ERR(rdt.cache); } err = iterate_dir(od->realfile, &rdt.ctx); ctx->pos = rdt.ctx.pos; return err; } static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_cache_entry *p; const struct cred *old_cred; int err; old_cred = ovl_override_creds(dentry->d_sb); if (!ctx->pos) ovl_dir_reset(file); if (od->is_real) { /* * If parent is merge, then need to adjust d_ino for '..', if * dir is impure then need to adjust d_ino for copied up * entries. */ if (ovl_xino_bits(ofs) || (ovl_same_fs(ofs) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { err = ovl_iterate_real(file, ctx); } else { err = iterate_dir(od->realfile, ctx); } goto out; } if (!od->cache) { struct ovl_dir_cache *cache; cache = ovl_cache_get(dentry); err = PTR_ERR(cache); if (IS_ERR(cache)) goto out; od->cache = cache; ovl_seek_cursor(od, ctx->pos); } while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); if (!p->is_whiteout) { if (!p->ino || p->check_xwhiteout) { err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) goto out; } } /* ovl_cache_update() sets is_whiteout on stale entry */ if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; } od->cursor = p->l_node.next; ctx->pos++; } err = 0; out: ovl_revert_creds(old_cred); return err; } static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) { loff_t res; struct ovl_dir_file *od = file->private_data; inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); if (od->is_real) { res = vfs_llseek(od->realfile, offset, origin); file->f_pos = od->realfile->f_pos; } else { res = -EINVAL; switch (origin) { case SEEK_CUR: offset += file->f_pos; break; case SEEK_SET: break; default: goto out_unlock; } if (offset < 0) goto out_unlock; if (offset != file->f_pos) { file->f_pos = offset; if (od->cache) ovl_seek_cursor(od, offset); } res = offset; } out_unlock: inode_unlock(file_inode(file)); return res; } static struct file *ovl_dir_open_realfile(const struct file *file, const struct path *realpath) { struct file *res; const struct cred *old_cred; old_cred = ovl_override_creds(file_inode(file)->i_sb); res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); ovl_revert_creds(old_cred); return res; } /* * Like ovl_real_fdget(), returns upperfile if dir was copied up since open. * Unlike ovl_real_fdget(), this caches upperfile in file->private_data. * * TODO: use same abstract type for file->private_data of dir and file so * upperfile could also be cached for files as well. */ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct file *old, *realfile = od->realfile; if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return want_upper ? NULL : realfile; /* * Need to check if we started out being a lower dir, but got copied up */ if (!od->is_upper) { realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_dir_open_realfile(file, &upperpath); if (IS_ERR(realfile)) return realfile; old = cmpxchg_release(&od->upperfile, NULL, realfile); if (old) { fput(realfile); realfile = old; } } } return realfile; } static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct file *realfile; int err; err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; realfile = ovl_dir_real_file(file, true); err = PTR_ERR_OR_ZERO(realfile); /* Nothing to sync for lower */ if (!realfile || err) return err; return vfs_fsync_range(realfile, start, end, datasync); } static int ovl_dir_release(struct inode *inode, struct file *file) { struct ovl_dir_file *od = file->private_data; if (od->cache) { inode_lock(inode); ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); if (od->upperfile) fput(od->upperfile); kfree(od); return 0; } static int ovl_dir_open(struct inode *inode, struct file *file) { struct path realpath; struct file *realfile; struct ovl_dir_file *od; enum ovl_path_type type; od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); if (!od) return -ENOMEM; type = ovl_path_real(file->f_path.dentry, &realpath); realfile = ovl_dir_open_realfile(file, &realpath); if (IS_ERR(realfile)) { kfree(od); return PTR_ERR(realfile); } od->realfile = realfile; od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; return 0; } WRAP_DIR_ITER(ovl_iterate) // FIXME! const struct file_operations ovl_dir_operations = { .read = generic_read_dir, .open = ovl_dir_open, .iterate_shared = shared_ovl_iterate, .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, }; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); err = ovl_dir_read_merged(dentry, list, &root); ovl_revert_creds(old_cred); if (err) return err; err = 0; list_for_each_entry_safe(p, n, list, l_node) { /* * Select whiteouts in upperdir, they should * be cleared when deleting this directory. */ if (p->is_whiteout) { if (p->is_upper) continue; goto del_entry; } if (p->name[0] == '.') { if (p->len == 1) goto del_entry; if (p->len == 2 && p->name[1] == '.') goto del_entry; } err = -ENOTEMPTY; break; del_entry: list_del(&p->l_node); ovl_cache_entry_free(p); } return err; } void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; list_for_each_entry(p, list, l_node) { struct dentry *dentry; if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; dentry = ovl_lookup_upper_unlocked(ofs, p->name, upper, p->len); if (IS_ERR(dentry)) { pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, (int) PTR_ERR(dentry)); continue; } if (dentry->d_inode) ovl_cleanup(ofs, upper, dentry); dput(dentry); } } static bool ovl_check_d_type(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); /* Even if d_type is not supported, DT_DIR is returned for . and .. */ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) return true; if (d_type != DT_UNKNOWN) rdd->d_type_supported = true; return true; } /* * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values * if error is encountered. */ int ovl_check_d_type_supported(const struct path *realpath) { int err; struct ovl_readdir_data rdd = { .ctx.actor = ovl_check_d_type, .ctx.count = INT_MAX, .d_type_supported = false, }; err = ovl_dir_read(realpath, &rdd); if (err) return err; return rdd.d_type_supported; } #define OVL_INCOMPATDIR_NAME "incompat" static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path, int level) { int err; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .ctx.count = INT_MAX, .list = &list, }; bool incompat = false; /* * The "work/incompat" directory is treated specially - if it is not * empty, instead of printing a generic error and mounting read-only, * we will error about incompat features and fail the mount. * * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name * starts with '#'. */ if (level == 2 && !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME)) incompat = true; err = ovl_dir_read(path, &rdd); if (err) goto out; list_for_each_entry(p, &list, l_node) { struct dentry *dentry; if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } else if (incompat) { pr_err("overlay with incompat feature '%s' cannot be mounted\n", p->name); err = -EINVAL; break; } dentry = ovl_lookup_upper_unlocked(ofs, p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) err = ovl_workdir_cleanup(ofs, path->dentry, path->mnt, dentry, level); dput(dentry); if (err) break; } out: ovl_cache_free(&list); return err; } int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent, struct vfsmount *mnt, struct dentry *dentry, int level) { int err; if (!d_is_dir(dentry) || level > 1) return ovl_cleanup(ofs, parent, dentry); err = ovl_parent_lock(parent, dentry); if (err) return err; err = ovl_do_rmdir(ofs, parent->d_inode, dentry); ovl_parent_unlock(parent); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); if (!err) err = ovl_cleanup(ofs, parent, dentry); } return err; } int ovl_indexdir_cleanup(struct ovl_fs *ofs) { int err; struct dentry *indexdir = ofs->workdir; struct dentry *index = NULL; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .ctx.count = INT_MAX, .list = &list, }; err = ovl_dir_read(&path, &rdd); if (err) goto out; list_for_each_entry(p, &list, l_node) { if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } index = ovl_lookup_upper_unlocked(ofs, p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; break; } /* Cleanup leftover from index create/cleanup attempt */ if (index->d_name.name[0] == '#') { err = ovl_workdir_cleanup(ofs, indexdir, path.mnt, index, 1); if (err) break; goto next; } err = ovl_verify_index(ofs, index); if (!err) { goto next; } else if (err == -ESTALE) { /* Cleanup stale index entries */ err = ovl_cleanup(ofs, indexdir, index); } else if (err != -ENOENT) { /* * Abort mount to avoid corrupting the index if * an incompatible index entry was found or on out * of memory. */ break; } else if (ofs->config.nfs_export) { /* * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ err = ovl_cleanup_and_whiteout(ofs, indexdir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, indexdir, index); } if (err) break; next: dput(index); index = NULL; } dput(index); out: ovl_cache_free(&list); if (err) pr_err("failed index dir cleanup (%i)\n", err); return err; }
31 32 4 28 29 8 7 3 5 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; #ifdef CONFIG_DEBUG_FS #include <linux/xarray.h> /* * ref_tracker_dir_init() is usually called in allocation-safe contexts, but * the same is not true of ref_tracker_dir_exit() which can be called from * anywhere an object is freed. Removing debugfs dentries is a blocking * operation, so we defer that work to the debugfs_reap_worker. * * Each dentry is tracked in the appropriate xarray. When * ref_tracker_dir_exit() is called, its entries in the xarrays are marked and * the workqueue job is scheduled. The worker then runs and deletes any marked * dentries asynchronously. */ static struct xarray debugfs_dentries; static struct xarray debugfs_symlinks; static struct work_struct debugfs_reap_worker; #define REF_TRACKER_DIR_DEAD XA_MARK_0 static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { unsigned long flags; xa_lock_irqsave(&debugfs_dentries, flags); __xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_dentries, flags); xa_lock_irqsave(&debugfs_symlinks, flags); __xa_set_mark(&debugfs_symlinks, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_symlinks, flags); schedule_work(&debugfs_reap_worker); } #else static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { } #endif static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { void __ostream_printf (*func)(struct ostream *stream, char *fmt, ...); char *prefix; char *buf; struct seq_file *seq; int size, used; }; static void __ostream_printf pr_ostream_log(struct ostream *stream, char *fmt, ...) { va_list args; va_start(args, fmt); vprintk(fmt, args); va_end(args); } static void __ostream_printf pr_ostream_buf(struct ostream *stream, char *fmt, ...) { int ret, len = stream->size - stream->used; va_list args; va_start(args, fmt); ret = vsnprintf(stream->buf + stream->used, len, fmt, args); va_end(args); if (ret > 0) stream->used += min(ret, len); } #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ _s->func(_s, fmt, ##args); \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n", s->prefix, dir->class, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix, dir->class, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n", s->prefix, dir->class, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = { .func = pr_ostream_log, .prefix = "ref_tracker: " }; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .func = pr_ostream_buf, .prefix = "ref_tracker: ", .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; /* * The xarray entries must be marked before the dir->lock is taken to * protect simultaneous debugfs readers. */ ref_tracker_debugfs_mark(dir); spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *ref_tracker_debug_dir = (struct dentry *)-ENOENT; static void __ostream_printf pr_ostream_seq(struct ostream *stream, char *fmt, ...) { va_list args; va_start(args, fmt); seq_vprintf(stream->seq, fmt, args); va_end(args); } static int ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq) { struct ostream os = { .func = pr_ostream_seq, .prefix = "", .seq = seq }; __ref_tracker_dir_pr_ostream(dir, 16, &os); return os.used; } static int ref_tracker_debugfs_show(struct seq_file *f, void *v) { struct ref_tracker_dir *dir = f->private; unsigned long index = (unsigned long)dir; unsigned long flags; int ret; /* * "dir" may not exist at this point if ref_tracker_dir_exit() has * already been called. Take care not to dereference it until its * legitimacy is established. * * The xa_lock is necessary to ensure that "dir" doesn't disappear * before its lock can be taken. If it's in the hash and not marked * dead, then it's safe to take dir->lock which prevents * ref_tracker_dir_exit() from completing. Once the dir->lock is * acquired, the xa_lock can be released. All of this must be IRQ-safe. */ xa_lock_irqsave(&debugfs_dentries, flags); if (!xa_load(&debugfs_dentries, index) || xa_get_mark(&debugfs_dentries, index, REF_TRACKER_DIR_DEAD)) { xa_unlock_irqrestore(&debugfs_dentries, flags); return -ENODATA; } spin_lock(&dir->lock); xa_unlock(&debugfs_dentries); ret = ref_tracker_dir_seq_print(dir, f); spin_unlock_irqrestore(&dir->lock, flags); return ret; } static int ref_tracker_debugfs_open(struct inode *inode, struct file *filp) { struct ref_tracker_dir *dir = inode->i_private; return single_open(filp, ref_tracker_debugfs_show, dir); } static const struct file_operations ref_tracker_debugfs_fops = { .owner = THIS_MODULE, .open = ref_tracker_debugfs_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /** * ref_tracker_dir_debugfs - create debugfs file for ref_tracker_dir * @dir: ref_tracker_dir to be associated with debugfs file * * In most cases, a debugfs file will be created automatically for every * ref_tracker_dir. If the object was created before debugfs is brought up * then that may fail. In those cases, it is safe to call this at a later * time to create the file. */ void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir) { char name[NAME_MAX + 1]; struct dentry *dentry; int ret; /* No-op if already created */ dentry = xa_load(&debugfs_dentries, (unsigned long)dir); if (dentry && !xa_is_err(dentry)) return; ret = snprintf(name, sizeof(name), "%s@%p", dir->class, dir); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { dentry = debugfs_create_file(name, S_IFREG | 0400, ref_tracker_debug_dir, dir, &ref_tracker_debugfs_fops); if (!IS_ERR(dentry)) { void *old; old = xa_store_irq(&debugfs_dentries, (unsigned long)dir, dentry, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(dentry); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_debugfs); void __ostream_printf ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...) { char name[NAME_MAX + 1]; struct dentry *symlink, *dentry; va_list args; int ret; symlink = xa_load(&debugfs_symlinks, (unsigned long)dir); dentry = xa_load(&debugfs_dentries, (unsigned long)dir); /* Already created?*/ if (symlink && !xa_is_err(symlink)) return; if (!dentry || xa_is_err(dentry)) return; va_start(args, fmt); ret = vsnprintf(name, sizeof(name), fmt, args); va_end(args); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { symlink = debugfs_create_symlink(name, ref_tracker_debug_dir, dentry->d_name.name); if (!IS_ERR(symlink)) { void *old; old = xa_store_irq(&debugfs_symlinks, (unsigned long)dir, symlink, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(symlink); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_symlink); static void debugfs_reap_work(struct work_struct *work) { struct dentry *dentry; unsigned long index; bool reaped; do { reaped = false; xa_for_each_marked(&debugfs_symlinks, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_symlinks, index); debugfs_remove(dentry); reaped = true; } xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_dentries, index); debugfs_remove(dentry); reaped = true; } } while (reaped); } static int __init ref_tracker_debugfs_postcore_init(void) { INIT_WORK(&debugfs_reap_worker, debugfs_reap_work); xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ); xa_init_flags(&debugfs_symlinks, XA_FLAGS_LOCK_IRQ); return 0; } postcore_initcall(ref_tracker_debugfs_postcore_init); static int __init ref_tracker_debugfs_late_init(void) { ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL); return 0; } late_initcall(ref_tracker_debugfs_late_init); #endif /* CONFIG_DEBUG_FS */
130 19 114 130 128 2 12 12 12 12 12 12 12 12 12 175 174 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ #include <linux/quotaops.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" /* * Convert from filesystem to in-memory representation. */ static struct posix_acl * ext4_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; struct posix_acl *acl; if (!value) return NULL; if (size < sizeof(ext4_acl_header)) return ERR_PTR(-EINVAL); if (((ext4_acl_header *)value)->a_version != cpu_to_le32(EXT4_ACL_VERSION)) return ERR_PTR(-EINVAL); value = (char *)value + sizeof(ext4_acl_header); count = ext4_acl_count(size); if (count < 0) return ERR_PTR(-EINVAL); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: value = (char *)value + sizeof(ext4_acl_entry_short); break; case ACL_USER: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); break; case ACL_GROUP: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); break; default: goto fail; } } if (value != end) goto fail; return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } /* * Convert from in-memory to filesystem representation. */ static void * ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) { ext4_acl_header *ext_acl; char *e; size_t n; *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); for (n = 0; n < acl->a_count; n++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; entry->e_tag = cpu_to_le16(acl_e->e_tag); entry->e_perm = cpu_to_le16(acl_e->e_perm); switch (acl_e->e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl_e->e_uid)); e += sizeof(ext4_acl_entry); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext4_acl_entry); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: e += sizeof(ext4_acl_entry_short); break; default: goto fail; } } return (char *)ext_acl; fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } /* * Inode operation get_posix_acl(). * * inode->i_rwsem: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; if (rcu) return ERR_PTR(-ECHILD); switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; break; default: BUG(); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); } if (retval > 0) acl = ext4_acl_from_disk(value, retval); else if (retval == -ENODATA || retval == -ENOSYS) acl = NULL; else acl = ERR_PTR(retval); kfree(value); return acl; } /* * Set the access or default ACL of an inode. * * inode->i_rwsem: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; size_t size = 0; int error; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; default: return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } error = ext4_xattr_set_handle(handle, inode, name_index, "", value, size, xattr_flags); kfree(value); if (!error) set_cached_acl(inode, type, acl); return error; } int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) update_mode = 1; } error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; } /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * * dir->i_rwsem: down * inode->i_rwsem: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); } else { inode->i_acl = NULL; } return error; }
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 // SPDX-License-Identifier: GPL-2.0-only /* * Detect Hung Task * * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state * */ #include <linux/mm.h> #include <linux/cpu.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/lockdep.h> #include <linux/export.h> #include <linux/panic_notifier.h> #include <linux/sysctl.h> #include <linux/suspend.h> #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> #include <linux/hung_task.h> #include <linux/rwsem.h> #include <trace/events/sched.h> /* * The number of tasks checked: */ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Total number of tasks detected as hung since boot: */ static unsigned long __read_mostly sysctl_hung_task_detect_count; /* * Limit number of tasks checked in a batch. * * This value controls the preemptibility of khungtaskd since preemption * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ #define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: */ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; #else #define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: */ static unsigned int __read_mostly sysctl_hung_task_panic = IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { did_panic = 1; return NOTIFY_DONE; } static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; static bool task_is_hung(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; unsigned int state = READ_ONCE(t->__state); /* * skip the TASK_KILLABLE tasks -- these can be killed * skip the TASK_IDLE tasks -- those are genuinely idle * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer */ if (!(state & TASK_UNINTERRUPTIBLE) || (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) return false; /* * When a freshly created task is scheduled once, changes its state to * TASK_UNINTERRUPTIBLE without having ever been switched out once, it * musn't be checked. */ if (unlikely(!switch_count)) return false; if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; t->last_switch_time = jiffies; return false; } if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) return false; return true; } #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER static void debug_show_blocker(struct task_struct *task, unsigned long timeout) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); blocker = READ_ONCE(task->blocker); if (!blocker) return; blocker_type = hung_task_get_blocker_type(blocker); switch (blocker_type) { case BLOCKER_TYPE_MUTEX: owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_SEM: owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_RWSEM_READER: case BLOCKER_TYPE_RWSEM_WRITER: owner = (unsigned long)rwsem_owner( hung_task_blocker_to_lock(blocker)); rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? "reader" : "writer"; rwsem_blocked_by = is_rwsem_reader_owned( hung_task_blocker_to_lock(blocker)) ? "reader" : "writer"; break; default: WARN_ON_ONCE(1); return; } if (unlikely(!owner)) { switch (blocker_type) { case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", task->comm, task->pid); break; case BLOCKER_TYPE_SEM: pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", task->comm, task->pid); break; case BLOCKER_TYPE_RWSEM_READER: case BLOCKER_TYPE_RWSEM_WRITER: pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", task->comm, task->pid); break; } return; } /* Ensure the owner information is correct. */ for_each_process_thread(g, t) { if ((unsigned long)t != owner) continue; switch (blocker_type) { case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", task->comm, task->pid, t->comm, t->pid); break; case BLOCKER_TYPE_SEM: pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", task->comm, task->pid, t->comm, t->pid); break; case BLOCKER_TYPE_RWSEM_READER: case BLOCKER_TYPE_RWSEM_WRITER: pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", task->comm, task->pid, rwsem_blocked_as, t->comm, t->pid, rwsem_blocked_by); break; } /* Avoid duplicated task dump, skip if the task is also hung. */ if (!task_is_hung(t, timeout)) sched_show_task(t); return; } } #else static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) { } #endif static void check_hung_task(struct task_struct *t, unsigned long timeout) { if (!task_is_hung(t, timeout)) return; /* * This counter tracks the total number of tasks detected as hung * since boot. */ sysctl_hung_task_detect_count++; trace_sched_process_hang(t); if (sysctl_hung_task_panic) { console_verbose(); hung_task_show_lock = true; hung_task_call_panic = true; } /* * Ok, the task did not get scheduled for more than 2 minutes, * complain: */ if (sysctl_hung_task_warnings || hung_task_call_panic) { if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, (jiffies - t->last_switch_time) / HZ); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); if (t->flags & PF_POSTCOREDUMP) pr_err(" Blocked by coredump.\n"); pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); debug_show_blocker(t, timeout); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; if (!sysctl_hung_task_warnings) pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } touch_nmi_watchdog(); } /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order * to exit the grace period. For classic RCU, a reschedule is required. */ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) { bool can_cont; get_task_struct(g); get_task_struct(t); rcu_read_unlock(); cond_resched(); rcu_read_lock(); can_cont = pid_alive(g) && pid_alive(t); put_task_struct(t); put_task_struct(g); return can_cont; } /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for * a really long time (120 seconds). If that happens, print out * a warning. */ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; /* * If the system crashed already then all bets are off, * do not report extra hung tasks: */ if (test_taint(TAINT_DIE) || did_panic) return; hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) goto unlock; if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; last_break = jiffies; } check_hung_task(t, timeout); } unlock: rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); if (hung_task_show_all_bt) { hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); } if (hung_task_call_panic) panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, unsigned long timeout) { /* timeout of 0 will disable the watchdog */ return timeout ? last_checked - jiffies + timeout * HZ : MAX_SCHEDULE_TIMEOUT; } #ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; wake_up_process(watchdog_task); out: return ret; } /* * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs * and hung_task_check_interval_secs */ static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); static const struct ctl_table hung_task_sysctls[] = { #ifdef CONFIG_SMP { .procname = "hung_task_all_cpu_backtrace", .data = &sysctl_hung_task_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "hung_task_timeout_secs", .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, .extra2 = (void *)&hung_task_timeout_max, }, { .procname = "hung_task_check_interval_secs", .data = &sysctl_hung_task_check_interval_secs, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, .extra2 = (void *)&hung_task_timeout_max, }, { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, }, { .procname = "hung_task_detect_count", .data = &sysctl_hung_task_detect_count, .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, }; static void __init hung_task_sysctl_init(void) { register_sysctl_init("kernel", hung_task_sysctls); } #else #define hung_task_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) { atomic_set(&reset_hung_task, 1); } EXPORT_SYMBOL_GPL(reset_hung_task_detector); static bool hung_detector_suspended; static int hungtask_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: case PM_RESTORE_PREPARE: hung_detector_suspended = true; break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: case PM_POST_RESTORE: hung_detector_suspended = false; break; default: break; } return NOTIFY_OK; } /* * kthread which checks for tasks stuck in D state */ static int watchdog(void *dummy) { unsigned long hung_last_checked = jiffies; set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; unsigned long interval = sysctl_hung_task_check_interval_secs; long t; if (interval == 0) interval = timeout; interval = min_t(unsigned long, interval, timeout); t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0) && !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; } schedule_timeout_interruptible(t); } return 0; } static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); /* Disable hung task detector on suspend */ pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); hung_task_sysctl_init(); return 0; } subsys_initcall(hung_task_init);
3 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013, 2014 * Phillip Lougher <phillip@squashfs.org.uk> */ #include <linux/bio.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/lz4.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" #define LZ4_LEGACY 1 struct lz4_comp_opts { __le32 version; __le32 flags; }; struct squashfs_lz4 { void *input; void *output; }; static void *lz4_comp_opts(struct squashfs_sb_info *msblk, void *buff, int len) { struct lz4_comp_opts *comp_opts = buff; /* LZ4 compressed filesystems always have compression options */ if (comp_opts == NULL || len < sizeof(*comp_opts)) return ERR_PTR(-EIO); if (le32_to_cpu(comp_opts->version) != LZ4_LEGACY) { /* LZ4 format currently used by the kernel is the 'legacy' * format */ ERROR("Unknown LZ4 version\n"); return ERR_PTR(-EINVAL); } return NULL; } static void *lz4_init(struct squashfs_sb_info *msblk, void *buff) { int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); struct squashfs_lz4 *stream; stream = kzalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) goto failed; stream->input = vmalloc(block_size); if (stream->input == NULL) goto failed2; stream->output = vmalloc(block_size); if (stream->output == NULL) goto failed3; return stream; failed3: vfree(stream->input); failed2: kfree(stream); failed: ERROR("Failed to initialise LZ4 decompressor\n"); return ERR_PTR(-ENOMEM); } static void lz4_free(void *strm) { struct squashfs_lz4 *stream = strm; if (stream) { vfree(stream->input); vfree(stream->output); } kfree(stream); } static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lz4 *stream = strm; void *buff = stream->input, *data; int bytes = length, res; while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; } res = LZ4_decompress_safe(stream->input, stream->output, length, output->length); if (res < 0) return -EIO; bytes = res; data = squashfs_first_page(output); buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { if (!IS_ERR(data)) memcpy(data, buff, bytes); break; } if (!IS_ERR(data)) memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); } squashfs_finish_page(output); return res; } const struct squashfs_decompressor squashfs_lz4_comp_ops = { .init = lz4_init, .comp_opts = lz4_comp_opts, .free = lz4_free, .decompress = lz4_uncompress, .id = LZ4_COMPRESSION, .name = "lz4", .alloc_buffer = 0, .supported = 1 };
2 5479 36 5471 2741 2 77 77 2743 2748 2 2 2 2 2 2 2 2 2 2 2 2 2742 7 2742 2745 5486 5476 5469 77 5487 2749 5478 5471 5477 5475 5478 5485 5471 16 16 16 16 750 748 2 2 2 5473 16 16 16 134 134 134 133 133 134 35 35 36 34 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/sched.h> #include <linux/random.h> #include <linux/sbitmap.h> #include <linux/seq_file.h> static int init_alloc_hint(struct sbitmap *sb, gfp_t flags) { unsigned depth = sb->depth; sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags); if (!sb->alloc_hint) return -ENOMEM; if (depth && !sb->round_robin) { int i; for_each_possible_cpu(i) *per_cpu_ptr(sb->alloc_hint, i) = get_random_u32_below(depth); } return 0; } static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb, unsigned int depth) { unsigned hint; hint = this_cpu_read(*sb->alloc_hint); if (unlikely(hint >= depth)) { hint = depth ? get_random_u32_below(depth) : 0; this_cpu_write(*sb->alloc_hint, hint); } return hint; } static inline void update_alloc_hint_after_get(struct sbitmap *sb, unsigned int depth, unsigned int hint, unsigned int nr) { if (nr == -1) { /* If the map is full, a hint won't do us much good. */ this_cpu_write(*sb->alloc_hint, 0); } else if (nr == hint || unlikely(sb->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; if (hint >= depth - 1) hint = 0; this_cpu_write(*sb->alloc_hint, hint); } } /* * See if we have deferred clears that we can batch move */ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map, unsigned int depth, unsigned int alloc_hint, bool wrap) { unsigned long mask, word_mask; guard(raw_spinlock_irqsave)(&map->swap_lock); if (!map->cleared) { if (depth == 0) return false; word_mask = (~0UL) >> (BITS_PER_LONG - depth); /* * The current behavior is to always retry after moving * ->cleared to word, and we change it to retry in case * of any free bits. To avoid an infinite loop, we need * to take wrap & alloc_hint into account, otherwise a * soft lockup may occur. */ if (!wrap && alloc_hint) word_mask &= ~((1UL << alloc_hint) - 1); return (READ_ONCE(map->word) & word_mask) != word_mask; } /* * First get a stable cleared mask, setting the old mask to 0. */ mask = xchg(&map->cleared, 0); /* * Now clear the masked bits in our free word */ atomic_long_andnot(mask, (atomic_long_t *)&map->word); BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word)); return true; } int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint) { unsigned int bits_per_word; int i; if (shift < 0) shift = sbitmap_calculate_shift(depth); bits_per_word = 1U << shift; if (bits_per_word > BITS_PER_LONG) return -EINVAL; sb->shift = shift; sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); sb->round_robin = round_robin; if (depth == 0) { sb->map = NULL; return 0; } if (alloc_hint) { if (init_alloc_hint(sb, flags)) return -ENOMEM; } else { sb->alloc_hint = NULL; } sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); if (!sb->map) { free_percpu(sb->alloc_hint); return -ENOMEM; } for (i = 0; i < sb->map_nr; i++) raw_spin_lock_init(&sb->map[i].swap_lock); return 0; } EXPORT_SYMBOL_GPL(sbitmap_init_node); void sbitmap_resize(struct sbitmap *sb, unsigned int depth) { unsigned int bits_per_word = 1U << sb->shift; unsigned int i; for (i = 0; i < sb->map_nr; i++) sbitmap_deferred_clear(&sb->map[i], 0, 0, 0); sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); } EXPORT_SYMBOL_GPL(sbitmap_resize); static int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { int nr; /* don't wrap if starting from 0 */ wrap = wrap && hint; while (1) { nr = find_next_zero_bit(word, depth, hint); if (unlikely(nr >= depth)) { /* * We started with an offset, and we didn't reset the * offset to 0 in a failure case, so start from 0 to * exhaust the map. */ if (hint && wrap) { hint = 0; continue; } return -1; } if (!test_and_set_bit_lock(nr, word)) break; hint = nr + 1; if (hint >= depth - 1) hint = 0; } return nr; } static int sbitmap_find_bit_in_word(struct sbitmap_word *map, unsigned int depth, unsigned int alloc_hint, bool wrap) { int nr; do { nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap); if (nr != -1) break; if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap)) break; } while (1); return nr; } static unsigned int __map_depth_with_shallow(const struct sbitmap *sb, int index, unsigned int shallow_depth) { u64 shallow_word_depth; unsigned int word_depth, reminder; word_depth = __map_depth(sb, index); if (shallow_depth >= sb->depth) return word_depth; shallow_word_depth = word_depth * shallow_depth; reminder = do_div(shallow_word_depth, sb->depth); if (reminder >= (index + 1) * word_depth) shallow_word_depth++; return (unsigned int)shallow_word_depth; } static int sbitmap_find_bit(struct sbitmap *sb, unsigned int shallow_depth, unsigned int index, unsigned int alloc_hint, bool wrap) { unsigned int i; int nr = -1; for (i = 0; i < sb->map_nr; i++) { unsigned int depth = __map_depth_with_shallow(sb, index, shallow_depth); if (depth) nr = sbitmap_find_bit_in_word(&sb->map[index], depth, alloc_hint, wrap); if (nr != -1) { nr += index << sb->shift; break; } /* Jump to next index. */ alloc_hint = 0; if (++index >= sb->map_nr) index = 0; } return nr; } static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); /* * Unless we're doing round robin tag allocation, just use the * alloc_hint to find the right word index. No point in looping * twice in find_next_zero_bit() for that case. */ if (sb->round_robin) alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); else alloc_hint = 0; return sbitmap_find_bit(sb, UINT_MAX, index, alloc_hint, !sb->round_robin); } int sbitmap_get(struct sbitmap *sb) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get(sb, hint); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } EXPORT_SYMBOL_GPL(sbitmap_get); static int __sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth) { unsigned int index; index = SB_NR_TO_INDEX(sb, alloc_hint); alloc_hint = SB_NR_TO_BIT(sb, alloc_hint); return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true); } /** * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, * limiting the depth used from each word. * @sb: Bitmap to allocate from. * @shallow_depth: The maximum number of bits to allocate from the bitmap. * * This rather specific operation allows for having multiple users with * different allocation limits. E.g., there can be a high-priority class that * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() * with a @shallow_depth of (sb->depth >> 1). Then, the low-priority * class can only allocate half of the total bits in the bitmap, preventing it * from starving out the high-priority class. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) { int nr; unsigned int hint, depth; if (WARN_ON_ONCE(unlikely(!sb->alloc_hint))) return -1; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); nr = __sbitmap_get_shallow(sb, hint, shallow_depth); update_alloc_hint_after_get(sb, depth, hint, nr); return nr; } bool sbitmap_any_bit_set(const struct sbitmap *sb) { unsigned int i; for (i = 0; i < sb->map_nr; i++) { if (sb->map[i].word & ~sb->map[i].cleared) return true; } return false; } EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set) { unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; unsigned int word_depth = __map_depth(sb, i); if (set) weight += bitmap_weight(&word->word, word_depth); else weight += bitmap_weight(&word->cleared, word_depth); } return weight; } static unsigned int sbitmap_cleared(const struct sbitmap *sb) { return __sbitmap_weight(sb, false); } unsigned int sbitmap_weight(const struct sbitmap *sb) { return __sbitmap_weight(sb, true) - sbitmap_cleared(sb); } EXPORT_SYMBOL_GPL(sbitmap_weight); void sbitmap_show(struct sbitmap *sb, struct seq_file *m) { seq_printf(m, "depth=%u\n", sb->depth); seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb)); seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); seq_printf(m, "map_nr=%u\n", sb->map_nr); } EXPORT_SYMBOL_GPL(sbitmap_show); static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) { if ((offset & 0xf) == 0) { if (offset != 0) seq_putc(m, '\n'); seq_printf(m, "%08x:", offset); } if ((offset & 0x1) == 0) seq_putc(m, ' '); seq_printf(m, "%02x", byte); } void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) { u8 byte = 0; unsigned int byte_bits = 0; unsigned int offset = 0; int i; for (i = 0; i < sb->map_nr; i++) { unsigned long word = READ_ONCE(sb->map[i].word); unsigned long cleared = READ_ONCE(sb->map[i].cleared); unsigned int word_bits = __map_depth(sb, i); word &= ~cleared; while (word_bits > 0) { unsigned int bits = min(8 - byte_bits, word_bits); byte |= (word & (BIT(bits) - 1)) << byte_bits; byte_bits += bits; if (byte_bits == 8) { emit_byte(m, offset, byte); byte = 0; byte_bits = 0; offset++; } word >>= bits; word_bits -= bits; } } if (byte_bits) { emit_byte(m, offset, byte); offset++; } if (offset) seq_putc(m, '\n'); } EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { return clamp_t(unsigned int, min(depth, sbq->min_shallow_depth) / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); } int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node) { int ret; int i; ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node, round_robin, true); if (ret) return ret; sbq->min_shallow_depth = UINT_MAX; sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); atomic_set(&sbq->completion_cnt, 0); atomic_set(&sbq->wakeup_cnt, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { sbitmap_free(&sbq->sb); return -ENOMEM; } for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch; wake_batch = sbq_calc_wake_batch(sbq, depth); if (sbq->wake_batch != wake_batch) WRITE_ONCE(sbq->wake_batch, wake_batch); } void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users) { unsigned int wake_batch; unsigned int depth = (sbq->sb.depth + users - 1) / users; wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth); sbitmap_resize(&sbq->sb, depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); int __sbitmap_queue_get(struct sbitmap_queue *sbq) { return sbitmap_get(&sbq->sb); } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset) { struct sbitmap *sb = &sbq->sb; unsigned int hint, depth; unsigned long index, nr; int i; if (unlikely(sb->round_robin)) return 0; depth = READ_ONCE(sb->depth); hint = update_alloc_hint_before_get(sb, depth); index = SB_NR_TO_INDEX(sb, hint); for (i = 0; i < sb->map_nr; i++) { struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; unsigned int map_depth = __map_depth(sb, index); unsigned long val; sbitmap_deferred_clear(map, 0, 0, 0); val = READ_ONCE(map->word); if (val == (1UL << (map_depth - 1)) - 1) goto next; nr = find_first_zero_bit(&val, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; get_mask = ((1UL << nr_tags) - 1) << nr; while (!atomic_long_try_cmpxchg(ptr, &val, get_mask | val)) ; get_mask = (get_mask & ~val) >> nr; if (get_mask) { *offset = nr + (index << sb->shift); update_alloc_hint_after_get(sb, depth, hint, *offset + nr_tags - 1); return get_mask; } } next: /* Jump to next index. */ if (++index >= sb->map_nr) index = 0; } return 0; } int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth) { WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); return sbitmap_get_shallow(&sbq->sb, shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow); void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth) { sbq->min_shallow_depth = min_shallow_depth; sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index, woken; if (!atomic_read(&sbq->ws_active)) return; wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; /* * Advance the index before checking the current queue. * It improves fairness, by ensuring the queue doesn't * need to be fully emptied before trying to wake up * from the next one. */ wake_index = sbq_index_inc(wake_index); if (waitqueue_active(&ws->wait)) { woken = wake_up_nr(&ws->wait, nr); if (woken == nr) break; nr -= woken; } } if (wake_index != atomic_read(&sbq->wake_index)) atomic_set(&sbq->wake_index, wake_index); } void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { unsigned int wake_batch = READ_ONCE(sbq->wake_batch); unsigned int wakeups; if (!atomic_read(&sbq->ws_active)) return; atomic_add(nr, &sbq->completion_cnt); wakeups = atomic_read(&sbq->wakeup_cnt); do { if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) return; } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, &wakeups, wakeups + wake_batch)); __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag) { if (likely(!sb->round_robin && tag < sb->depth)) data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag); } void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags) { struct sbitmap *sb = &sbq->sb; unsigned long *addr = NULL; unsigned long mask = 0; int i; smp_mb__before_atomic(); for (i = 0; i < nr_tags; i++) { const int tag = tags[i] - offset; unsigned long *this_addr; /* since we're clearing a batch, skip the deferred map */ this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word; if (!addr) { addr = this_addr; } else if (addr != this_addr) { atomic_long_andnot(mask, (atomic_long_t *) addr); mask = 0; addr = this_addr; } mask |= (1UL << SB_NR_TO_BIT(sb, tag)); } if (mask) atomic_long_andnot(mask, (atomic_long_t *) addr); smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, nr_tags); sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), tags[nr_tags - 1] - offset); } void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { /* * Once the clear bit is set, the bit may be allocated out. * * Orders READ/WRITE on the associated instance(such as request * of blk_mq) by this bit for avoiding race with re-allocation, * and its pair is the memory barrier implied in __sbitmap_get_word. * * One invariant is that the clear bit has to be zero when the bit * is in use. */ smp_mb__before_atomic(); sbitmap_deferred_clear_bit(&sbq->sb, nr); /* * Pairs with the memory barrier in set_current_state() to ensure the * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the * waiter. See the comment on waitqueue_active(). */ smp_mb__after_atomic(); sbitmap_queue_wake_up(sbq, 1); sbitmap_update_cpu_hint(&sbq->sb, cpu, nr); } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) { int i, wake_index; /* * Pairs with the memory barrier in set_current_state() like in * sbitmap_queue_wake_up(). */ smp_mb(); wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index]; if (waitqueue_active(&ws->wait)) wake_up(&ws->wait); wake_index = sbq_index_inc(wake_index); } } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) { bool first; int i; sbitmap_show(&sbq->sb, m); seq_puts(m, "alloc_hint={"); first = true; for_each_possible_cpu(i) { if (!first) seq_puts(m, ", "); first = false; seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i)); } seq_puts(m, "}\n"); seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active)); seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin); seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { if (!sbq_wait->sbq) { sbq_wait->sbq = sbq; atomic_inc(&sbq->ws_active); add_wait_queue(&ws->wait, &sbq_wait->wait); } } EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) { list_del_init(&sbq_wait->wait.entry); if (sbq_wait->sbq) { atomic_dec(&sbq_wait->sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state) { if (!sbq_wait->sbq) { atomic_inc(&sbq->ws_active); sbq_wait->sbq = sbq; } prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); } EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait) { finish_wait(&ws->wait, &sbq_wait->wait); if (sbq_wait->sbq) { atomic_dec(&sbq->ws_active); sbq_wait->sbq = NULL; } } EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
38 30 28 10 35 35 34 9 62 62 42 61 19 1 2 39 39 3 37 38 19 62 2 62 23 38 32 63 63 2 12 20 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" #include <trace/events/erofs.h> struct erofs_qstr { const unsigned char *name; const unsigned char *end; }; /* based on the end of qn is accurate and it must have the trailing '\0' */ static inline int erofs_dirnamecmp(const struct erofs_qstr *qn, const struct erofs_qstr *qd, unsigned int *matched) { unsigned int i = *matched; /* * on-disk error, let's only BUG_ON in the debugging mode. * otherwise, it will return 1 to just skip the invalid name * and go on (in consideration of the lookup performance). */ DBG_BUGON(qd->name > qd->end); /* qd could not have trailing '\0' */ /* However it is absolutely safe if < qd->end */ while (qd->name + i < qd->end && qd->name[i] != '\0') { if (qn->name[i] != qd->name[i]) { *matched = i; return qn->name[i] > qd->name[i] ? 1 : -1; } ++i; } *matched = i; /* See comments in __d_alloc on the terminating NUL character */ return qn->name[i] == '\0' ? 0 : 1; } #define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1)) static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, u8 *data, unsigned int dirblksize, const int ndirents) { int head, back; unsigned int startprfx, endprfx; struct erofs_dirent *const de = (struct erofs_dirent *)data; /* since the 1st dirent has been evaluated previously */ head = 1; back = ndirents - 1; startprfx = endprfx = 0; while (head <= back) { const int mid = head + (back - head) / 2; const int nameoff = nameoff_from_disk(de[mid].nameoff, dirblksize); unsigned int matched = min(startprfx, endprfx); struct erofs_qstr dname = { .name = data + nameoff, .end = mid >= ndirents - 1 ? data + dirblksize : data + nameoff_from_disk(de[mid + 1].nameoff, dirblksize) }; /* string comparison without already matched prefix */ int ret = erofs_dirnamecmp(name, &dname, &matched); if (!ret) { return de + mid; } else if (ret > 0) { head = mid + 1; startprfx = matched; } else { back = mid - 1; endprfx = matched; } } return ERR_PTR(-ENOENT); } static void *erofs_find_target_block(struct erofs_buf *target, struct inode *dir, struct erofs_qstr *name, int *_ndirents) { unsigned int bsz = i_blocksize(dir); int head = 0, back = erofs_iblks(dir) - 1; unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); while (head <= back) { const int mid = head + (back - head) / 2; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; buf.mapping = dir->i_mapping; de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); int diff; unsigned int matched; struct erofs_qstr dname; if (!ndirents) { erofs_put_metabuf(&buf); erofs_err(dir->i_sb, "corrupted dir block %d @ nid %llu", mid, EROFS_I(dir)->nid); DBG_BUGON(1); de = ERR_PTR(-EFSCORRUPTED); goto out; } matched = min(startprfx, endprfx); dname.name = (u8 *)de + nameoff; if (ndirents == 1) dname.end = (u8 *)de + bsz; else dname.end = (u8 *)de + nameoff_from_disk(de[1].nameoff, bsz); /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); if (diff < 0) { erofs_put_metabuf(&buf); back = mid - 1; endprfx = matched; continue; } if (!IS_ERR(candidate)) erofs_put_metabuf(target); *target = buf; if (!diff) { *_ndirents = 0; return de; } head = mid + 1; startprfx = matched; candidate = de; *_ndirents = ndirents; continue; } out: /* free if the candidate is valid */ if (!IS_ERR(candidate)) erofs_put_metabuf(target); return de; } return candidate; } int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type) { int ndirents; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; struct erofs_qstr qn; if (!dir->i_size) return -ENOENT; qn.name = name->name; qn.end = name->name + name->len; buf.mapping = dir->i_mapping; ndirents = 0; de = erofs_find_target_block(&buf, dir, &qn, &ndirents); if (IS_ERR(de)) return PTR_ERR(de); if (ndirents) de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir), ndirents); if (!IS_ERR(de)) { *nid = le64_to_cpu(de->nid); *d_type = de->file_type; } erofs_put_metabuf(&buf); return PTR_ERR_OR_ZERO(de); } static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int err; erofs_nid_t nid; unsigned int d_type; struct inode *inode; trace_erofs_lookup(dir, dentry, flags); if (dentry->d_name.len > EROFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); if (err == -ENOENT) /* negative dentry */ inode = NULL; else if (err) inode = ERR_PTR(err); else inode = erofs_iget(dir->i_sb, nid); return d_splice_alias(inode, dentry); } const struct inode_operations erofs_dir_iops = { .lookup = erofs_lookup, .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, };
17 17 24 27 2 2 11 2 12 2 11 12 11 2 24 11 27 27 27 27 27 27 27 10 27 27 24 10 2 2 2 2 4 2 17 4 17 2 2 2 2 2 2 2 2 2 2 2 1007 2 1007 11 11 2892 2899 2891 17 2899 11 4 11 11 11 11 11 11 10 11 11 11 11 11 24 24 18 8 2874 2870 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: * The group->recnt and mark->refcnt tell how many "things" in the kernel * currently are referencing the objects. Both kind of objects typically will * live inside the kernel with a refcnt of 2, one for its creation and one for * the reference a group and a mark hold to each other. * If you are holding the appropriate locks, you can take a reference and the * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: * There are 3 locks involved with fsnotify inode marks and they MUST be taken * in order as follows: * * group->mark_mutex * mark->lock * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private * data (i.e group limits). * mark->lock protects the marks attributes like its masks and flags. * Furthermore it protects the access to a reference of the group that the mark * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * * mark->connector->lock protects the list of marks anchored inside an * inode / vfsmount and each mark is hooked via the i_list. * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets * detached from fsnotify_mark_connector when last reference to the mark is * dropped. Thus having mark reference is enough to protect mark->connector * pointer and to make sure fsnotify_mark_connector cannot disappear. Also * because we remove mark from g_list before dropping mark reference associated * with that, any mark found through g_list is guaranteed to have * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) * - The inode is being evicted from cache. (fsnotify_inode_delete) * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> #include <linux/ratelimit.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; struct kmem_cache *fsnotify_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); static void fsnotify_connector_destroy_workfn(struct work_struct *work); static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { WARN_ON_ONCE(!refcount_read(&mark->refcnt)); refcount_inc(&mark->refcnt); } static fsnotify_connp_t *fsnotify_object_connp(void *obj, enum fsnotify_obj_type obj_type) { switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: return &((struct inode *)obj)->i_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); case FSNOTIFY_OBJ_TYPE_MNTNS: return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } } static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) { if (WARN_ON(!fsnotify_valid_obj_type(conn->type))) return 0; return READ_ONCE(*fsnotify_conn_mask_p(conn)); } static void fsnotify_get_sb_watched_objects(struct super_block *sb) { atomic_long_inc(fsnotify_sb_watched_objects(sb)); } static void fsnotify_put_sb_watched_objects(struct super_block *sb) { atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb); /* the superblock can go away after this decrement */ if (atomic_long_dec_and_test(watched_objects)) wake_up_var(watched_objects); } static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); fsnotify_get_sb_watched_objects(inode->i_sb); } static void fsnotify_put_inode_ref(struct inode *inode) { /* read ->i_sb before the inode can go away */ struct super_block *sb = inode->i_sb; iput(inode); fsnotify_put_sb_watched_objects(sb); } /* * Grab or drop watched objects reference depending on whether the connector * is attached and has any marks attached. */ static void fsnotify_update_sb_watchers(struct super_block *sb, struct fsnotify_mark_connector *conn) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; struct fsnotify_mark *first_mark = NULL; unsigned int highest_prio = 0; if (conn->obj) first_mark = hlist_entry_safe(conn->list.first, struct fsnotify_mark, obj_list); if (first_mark) highest_prio = first_mark->group->priority; if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) highest_prio = 0; /* * If the highest priority of group watching this object is prio, * then watched object has a reference on counters [0..prio]. * Update priority >= 1 watched objects counters. */ for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) atomic_long_inc(&sbinfo->watched_objects[p]); for (unsigned int p = conn->prio; p > highest_prio; p--) atomic_long_dec(&sbinfo->watched_objects[p]); conn->prio = highest_prio; /* Update priority >= 0 (a.k.a total) watched objects counter */ BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); if (first_mark && !is_watched) { conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_get_sb_watched_objects(sb); } else if (!first_mark && is_watched) { conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_put_sb_watched_objects(sb); } } /* * Grab or drop inode reference for the connector if needed. * * When it's time to drop the reference, we only clear the HAS_IREF flag and * return the inode object. fsnotify_drop_object() will be resonsible for doing * iput() outside of spinlocks. This happens when last mark that wanted iref is * detached. */ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, bool want_iref) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; struct inode *inode = NULL; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == has_iref) return NULL; if (want_iref) { /* Pin inode if any mark wants inode refcount held */ fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; } else { /* Unpin inode after detach of last mark that wanted iref */ inode = fsnotify_conn_inode(conn); conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; } return inode; } static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) continue; new_mask |= fsnotify_calc_mask(mark); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) want_iref = true; } /* * We use WRITE_ONCE() to prevent silly compiler optimizations from * confusing readers not holding conn->lock with partial updates. */ WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask); return fsnotify_update_iref(conn, want_iref); } static bool fsnotify_conn_watches_children( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return false; return fsnotify_inode_watches_children(fsnotify_conn_inode(conn)); } static void fsnotify_conn_set_children_dentry_flags( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return; fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn)); } /* * Calculate mask of events for a list of marks. The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { bool update_children; if (!conn) return; spin_lock(&conn->lock); update_children = !fsnotify_conn_watches_children(conn); __fsnotify_recalc_mask(conn); update_children &= fsnotify_conn_watches_children(conn); spin_unlock(&conn->lock); /* * Set children's PARENT_WATCHED flags only if parent started watching. * When parent stops watching, we clear false positive PARENT_WATCHED * flags lazily in __fsnotify_parent(). */ if (update_children) fsnotify_conn_set_children_dentry_flags(conn); } /* Free all connectors queued for freeing once SRCU period ends */ static void fsnotify_connector_destroy_workfn(struct work_struct *work) { struct fsnotify_mark_connector *conn, *free; spin_lock(&destroy_lock); conn = connector_destroy_list; connector_destroy_list = NULL; spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); while (conn) { free = conn; conn = conn->destroy_next; kmem_cache_free(fsnotify_mark_connector_cachep, free); } } static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; if (sb) fsnotify_update_sb_watchers(sb, conn); return inode; } static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; if (WARN_ON_ONCE(!group)) return; group->ops->free_mark(mark); fsnotify_put_group(group); } /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } /* * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { struct super_block *sb = fsnotify_connector_sb(conn); /* Update watched objects after detaching mark */ if (sb) fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object * list. Mark can be already removed from the list by now and on its way to be * destroyed once SRCU period ends. * * Also pin the group so it doesn't disappear under us. */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { if (!mark) return true; if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ atomic_inc(&mark->group->user_waits); spin_unlock(&mark->lock); return true; } spin_unlock(&mark->lock); fsnotify_put_mark(mark); } return false; } /* * Puts marks and wakes up group destruction if necessary. * * Pairs with fsnotify_get_mark_safe() */ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) { if (mark) { struct fsnotify_group *group = mark->group; fsnotify_put_mark(mark); /* * We abuse notification_waitq on group shutdown for waiting for * all marks pinned when waiting for userspace. */ if (atomic_dec_and_test(&group->user_waits) && group->shutdown) wake_up(&group->notification_waitq); } } bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) __releases(&fsnotify_mark_srcu) { int type; fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); goto fail; } } /* * Now that both marks are pinned by refcount in the inode / vfsmount * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; fail: for (type--; type >= 0; type--) fsnotify_put_mark_wake(iter_info->marks[type]); return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) __acquires(&fsnotify_mark_srcu) { int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being * removed from group list before corresponding reference to it is dropped. In * particular we rely on mark->connector being valid while we hold * group->mark_mutex if we found the mark through g_list. * * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; list_del_init(&mark->g_list); spin_unlock(&mark->lock); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } /* * Free fsnotify mark. The mark is actually only marked as being freed. The * freeing is actually happening only once last reference to the mark is * dropped from a workqueue which first waits for srcu period end. * * Caller must have a reference to the mark or be protected by * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark * is being freed. */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { fsnotify_group_lock(group); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. * * Fanotify supports different notification classes (reflected as priority of * notification group). Events shall be passed to notification groups in * decreasing priority order. To achieve this marks in notification lists for * inodes and vfsmounts are sorted so that priorities of corresponding groups * are descending. * * Furthermore correct handling of the ignore mask requires processing inode * and vfsmount marks of each group together. Using the group address as * further sort criterion provides a unique sorting order and thus we can * merge inode and vfsmount lists of marks in linear time and find groups * present in both lists. * * A return value of 1 signifies that b has priority over a. * A return value of 0 signifies that the two marks have to be handled together. * A return value of -1 signifies that a has priority over b. */ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) { if (a == b) return 0; if (!a) return 1; if (!b) return -1; if (a->priority < b->priority) return 1; if (a->priority > b->priority) return -1; if (a < b) return 1; return -1; } static int fsnotify_attach_info_to_sb(struct super_block *sb) { struct fsnotify_sb_info *sbinfo; /* sb info is freed on fsnotify_sb_delete() */ sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); if (!sbinfo) return -ENOMEM; /* * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() * will observe an initialized structure */ if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { /* Someone else created sbinfo for us */ kfree(sbinfo); } return 0; } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (!conn) return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->prio = 0; conn->type = obj_type; conn->obj = obj; /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ kmem_cache_free(fsnotify_mark_connector_cachep, conn); } return 0; } /* * Get mark connector, make sure it is alive and return with its lock held. * This is for users that get connector pointer from inode or mount. Users that * hold reference to a mark on the list may directly lock connector->lock as * they are sure list cannot go away under them. */ static struct fsnotify_mark_connector *fsnotify_grab_connector( fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; int idx; idx = srcu_read_lock(&fsnotify_mark_srcu); conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (!conn) goto out; spin_lock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; } out: srcu_read_unlock(&fsnotify_mark_srcu, idx); return conn; } /* * Add mark into proper place in given list of marks. These marks may be used * for the fsnotify backend to determine which event types should be delivered * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; } connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; } /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); goto added; } /* should mark be in the middle of the current list? */ hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: if (sb) fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone * seeing mark->connector set. */ WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! * group->mark_mutex * mark->lock * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; fsnotify_recalc_mask(mark->connector); return ret; err: spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); fsnotify_put_mark(mark); return ret; } int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; if (!connp) return NULL; conn = fsnotify_grab_connector(connp); if (!conn) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group && (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; } } spin_unlock(&conn->lock); return NULL; } EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our * to_free list so we have to use mark_mutex even when accessing that * list. And freeing mark requires us to drop mark_mutex. So we can * reliably free only the first mark in the list. That's why we first * move marks to free to to_free list in one go and then free marks in * to_free list one by one. */ fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } fsnotify_group_unlock(group); clear: while (1) { fsnotify_group_lock(group); if (list_empty(head)) { fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } /* Destroy all marks attached to an object via connector */ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) return; /* * We have to be careful since we can race with e.g. * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the * list can get modified. However we are holding mark reference and * thus our mark cannot be removed from obj_list so we can continue * iteration after regaining conn->lock. */ hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); old_mark = mark; fsnotify_destroy_mark(mark, mark->group); spin_lock(&conn->lock); } /* * Detach list from object now so that we don't pin inode until all * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; WRITE_ONCE(mark->connector, NULL); } EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; spin_lock(&destroy_lock); /* exchange the list head */ list_replace_init(&destroy_list, &private_destroy_list); spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); fsnotify_final_mark_destroy(mark); } } /* Wait for all marks queued for destruction to be actually destroyed */ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
97 5933 5941 5932 5943 215 4561 4574 4324 4575 5856 5869 4252 4392 228 230 229 5844 5854 4189 4435 340 341 340 1128 1130 782 790 3 3 3 3 3 3 2 2 2 3 3 3 8 8 3 3 2 2 3 3 8 8 3 3 3 3 3 3 1 3 3 3 1 7 7 7 2 2 2 2 2 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/nstree.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_common_init(ns); if (ret) goto fail_free; /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_up_base(extents, map, id, count); else extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } u32 map_id_up(struct uid_gid_map *map, u32 id) { return map_id_range_up(map, id, 1); } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init);
8 8 1 7 6 1 1 1 1 1 1 1 2 1 1 1 2 2 6 6 6 6 5 6 6 6 6 6 3 6 7 7 1 6 5 5 5 1 2 2 2 2 1 1 1 1 1 2 6 6 6 6 2 2 2 1 1 2 3 3 1 1 5 2 4 5 5 5 5 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 // SPDX-License-Identifier: GPL-1.0+ /* * n_tty.c --- implements the N_TTY line discipline. * * This code used to be in tty_io.c, but things are getting hairy * enough that it made sense to split things off. (The N_TTY * processing has changed so much that it's hardly recognizable, * anyway...) * * Note that the open routine for N_TTY is guaranteed never to return * an error. This is because Linux will fall back to setting a line * to N_TTY if it can not switch to any other line discipline. * * Written by Theodore Ts'o, Copyright 1994. * * This file also contains code originally written by Linus Torvalds, * Copyright 1991, 1992, 1993, and by Julian Cowley, Copyright 1994. * * Reduced memory usage for older ARM systems - Russell King. * * 2000/01/20 Fixed SMP locking on put_tty_queue using bits of * the patch by Andrew J. Kroll <ag784@freenet.buffalo.edu> * who actually finally proved there really was a race. * * 2002/03/18 Implemented n_tty_wakeup to send SIGIO POLL_OUTs to * waiting writing processes-Sapan Bhatia <sapan@corewars.org>. * Also fixed a bug in BLOCKING mode where n_tty_write returns * EAGAIN */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/jiffies.h> #include <linux/math.h> #include <linux/poll.h> #include <linux/ratelimit.h> #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/tty.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include "tty.h" /* * Until this number of characters is queued in the xmit buffer, select will * return "we have room for writes". */ #define WAKEUP_CHARS 256 #define N_TTY_BUF_SIZE 4096 /* * This defines the low- and high-watermarks for throttling and * unthrottling the TTY driver. These watermarks are used for * controlling the space in the read buffer. */ #define TTY_THRESHOLD_THROTTLE 128 /* now based on remaining room */ #define TTY_THRESHOLD_UNTHROTTLE 128 /* * Special byte codes used in the echo buffer to represent operations * or special handling of characters. Bytes in the echo buffer that * are not part of such special blocks are treated as normal character * codes. */ #define ECHO_OP_START 0xff #define ECHO_OP_MOVE_BACK_COL 0x80 #define ECHO_OP_SET_CANON_COL 0x81 #define ECHO_OP_ERASE_TAB 0x82 #define ECHO_COMMIT_WATERMARK 256 #define ECHO_BLOCK 256 #define ECHO_DISCARD_WATERMARK N_TTY_BUF_SIZE - (ECHO_BLOCK + 32) struct n_tty_data { /* producer-published */ size_t read_head; size_t commit_head; size_t canon_head; size_t echo_head; size_t echo_commit; size_t echo_mark; DECLARE_BITMAP(char_map, 256); /* private to n_tty_receive_overrun (single-threaded) */ unsigned long overrun_time; unsigned int num_overrun; /* non-atomic */ bool no_room; /* must hold exclusive termios_rwsem to reset these */ unsigned char lnext:1, erasing:1, raw:1, real_raw:1, icanon:1; unsigned char push:1; /* shared by producer and consumer */ u8 read_buf[N_TTY_BUF_SIZE]; DECLARE_BITMAP(read_flags, N_TTY_BUF_SIZE); u8 echo_buf[N_TTY_BUF_SIZE]; /* consumer-published */ size_t read_tail; size_t line_start; /* # of chars looked ahead (to find software flow control chars) */ size_t lookahead_count; /* protected by output lock */ unsigned int column; unsigned int canon_column; size_t echo_tail; struct mutex atomic_read_lock; struct mutex output_lock; }; #define MASK(x) ((x) & (N_TTY_BUF_SIZE - 1)) static inline size_t read_cnt(struct n_tty_data *ldata) { return ldata->read_head - ldata->read_tail; } static inline u8 read_buf(struct n_tty_data *ldata, size_t i) { return ldata->read_buf[MASK(i)]; } static inline u8 *read_buf_addr(struct n_tty_data *ldata, size_t i) { return &ldata->read_buf[MASK(i)]; } static inline u8 echo_buf(struct n_tty_data *ldata, size_t i) { smp_rmb(); /* Matches smp_wmb() in add_echo_byte(). */ return ldata->echo_buf[MASK(i)]; } static inline u8 *echo_buf_addr(struct n_tty_data *ldata, size_t i) { return &ldata->echo_buf[MASK(i)]; } /* If we are not echoing the data, perhaps this is a secret so erase it */ static void zero_buffer(const struct tty_struct *tty, u8 *buffer, size_t size) { if (L_ICANON(tty) && !L_ECHO(tty)) memset(buffer, 0, size); } static void tty_copy(const struct tty_struct *tty, void *to, size_t tail, size_t n) { struct n_tty_data *ldata = tty->disc_data; size_t size = N_TTY_BUF_SIZE - tail; void *from = read_buf_addr(ldata, tail); if (n > size) { tty_audit_add_data(tty, from, size); memcpy(to, from, size); zero_buffer(tty, from, size); to += size; n -= size; from = ldata->read_buf; } tty_audit_add_data(tty, from, n); memcpy(to, from, n); zero_buffer(tty, from, n); } /** * n_tty_kick_worker - start input worker (if required) * @tty: terminal * * Re-schedules the flip buffer work if it may have stopped. * * Locking: * * Caller holds exclusive %termios_rwsem, or * * n_tty_read()/consumer path: * holds non-exclusive %termios_rwsem */ static void n_tty_kick_worker(const struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; /* Did the input worker stop? Restart it */ if (unlikely(READ_ONCE(ldata->no_room))) { WRITE_ONCE(ldata->no_room, 0); WARN_RATELIMIT(tty->port->itty == NULL, "scheduling with invalid itty\n"); /* see if ldisc has been killed - if so, this means that * even though the ldisc has been halted and ->buf.work * cancelled, ->buf.work is about to be rescheduled */ WARN_RATELIMIT(test_bit(TTY_LDISC_HALTED, &tty->flags), "scheduling buffer work for halted ldisc\n"); tty_buffer_restart_work(tty->port); } } static ssize_t chars_in_buffer(const struct tty_struct *tty) { const struct n_tty_data *ldata = tty->disc_data; size_t head = ldata->icanon ? ldata->canon_head : ldata->commit_head; return head - ldata->read_tail; } /** * n_tty_write_wakeup - asynchronous I/O notifier * @tty: tty device * * Required for the ptys, serial driver etc. since processes that attach * themselves to the master and rely on ASYNC IO must be woken up. */ static void n_tty_write_wakeup(struct tty_struct *tty) { clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); kill_fasync(&tty->fasync, SIGIO, POLL_OUT); } static void n_tty_check_throttle(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; /* * Check the remaining room for the input canonicalization * mode. We don't want to throttle the driver if we're in * canonical mode and don't have a newline yet! */ if (ldata->icanon && ldata->canon_head == ldata->read_tail) return; do { tty_set_flow_change(tty, TTY_THROTTLE_SAFE); if (N_TTY_BUF_SIZE - read_cnt(ldata) >= TTY_THRESHOLD_THROTTLE) break; } while (!tty_throttle_safe(tty)); __tty_set_flow_change(tty, 0); } static void n_tty_check_unthrottle(struct tty_struct *tty) { if (tty->driver->type == TTY_DRIVER_TYPE_PTY) { if (chars_in_buffer(tty) > TTY_THRESHOLD_UNTHROTTLE) return; n_tty_kick_worker(tty); tty_wakeup(tty->link); return; } /* If there is enough space in the read buffer now, let the * low-level driver know. We use chars_in_buffer() to * check the buffer, as it now knows about canonical mode. * Otherwise, if the driver is throttled and the line is * longer than TTY_THRESHOLD_UNTHROTTLE in canonical mode, * we won't get any more characters. */ do { tty_set_flow_change(tty, TTY_UNTHROTTLE_SAFE); if (chars_in_buffer(tty) > TTY_THRESHOLD_UNTHROTTLE) break; n_tty_kick_worker(tty); } while (!tty_unthrottle_safe(tty)); __tty_set_flow_change(tty, 0); } /** * put_tty_queue - add character to tty * @c: character * @ldata: n_tty data * * Add a character to the tty read_buf queue. * * Locking: * * n_tty_receive_buf()/producer path: * caller holds non-exclusive %termios_rwsem */ static inline void put_tty_queue(u8 c, struct n_tty_data *ldata) { *read_buf_addr(ldata, ldata->read_head) = c; ldata->read_head++; } /** * reset_buffer_flags - reset buffer state * @ldata: line disc data to reset * * Reset the read buffer counters and clear the flags. Called from * n_tty_open() and n_tty_flush_buffer(). * * Locking: * * caller holds exclusive %termios_rwsem, or * * (locking is not required) */ static void reset_buffer_flags(struct n_tty_data *ldata) { ldata->read_head = ldata->canon_head = ldata->read_tail = 0; ldata->commit_head = 0; ldata->line_start = 0; ldata->erasing = 0; bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE); ldata->push = 0; ldata->lookahead_count = 0; } static void n_tty_packet_mode_flush(struct tty_struct *tty) { unsigned long flags; if (tty->link->ctrl.packet) { spin_lock_irqsave(&tty->ctrl.lock, flags); tty->ctrl.pktstatus |= TIOCPKT_FLUSHREAD; spin_unlock_irqrestore(&tty->ctrl.lock, flags); wake_up_interruptible(&tty->link->read_wait); } } /** * n_tty_flush_buffer - clean input queue * @tty: terminal device * * Flush the input buffer. Called when the tty layer wants the buffer flushed * (eg at hangup) or when the %N_TTY line discipline internally has to clean * the pending queue (for example some signals). * * Holds %termios_rwsem to exclude producer/consumer while buffer indices are * reset. * * Locking: %ctrl.lock, exclusive %termios_rwsem */ static void n_tty_flush_buffer(struct tty_struct *tty) { down_write(&tty->termios_rwsem); reset_buffer_flags(tty->disc_data); n_tty_kick_worker(tty); if (tty->link) n_tty_packet_mode_flush(tty); up_write(&tty->termios_rwsem); } /** * is_utf8_continuation - utf8 multibyte check * @c: byte to check * * Returns: true if the utf8 character @c is a multibyte continuation * character. We use this to correctly compute the on-screen size of the * character when printing. */ static inline int is_utf8_continuation(u8 c) { return (c & 0xc0) == 0x80; } /** * is_continuation - multibyte check * @c: byte to check * @tty: terminal device * * Returns: true if the utf8 character @c is a multibyte continuation character * and the terminal is in unicode mode. */ static inline int is_continuation(u8 c, const struct tty_struct *tty) { return I_IUTF8(tty) && is_utf8_continuation(c); } /** * do_output_char - output one character * @c: character (or partial unicode symbol) * @tty: terminal device * @space: space available in tty driver write buffer * * This is a helper function that handles one output character (including * special characters like TAB, CR, LF, etc.), doing OPOST processing and * putting the results in the tty driver's write buffer. * * Note that Linux currently ignores TABDLY, CRDLY, VTDLY, FFDLY and NLDLY. * They simply aren't relevant in the world today. If you ever need them, add * them here. * * Returns: the number of bytes of buffer space used or -1 if no space left. * * Locking: should be called under the %output_lock to protect the column state * and space left in the buffer. */ static int do_output_char(u8 c, struct tty_struct *tty, int space) { struct n_tty_data *ldata = tty->disc_data; int spaces; if (!space) return -1; switch (c) { case '\n': if (O_ONLRET(tty)) ldata->column = 0; if (O_ONLCR(tty)) { if (space < 2) return -1; ldata->canon_column = ldata->column = 0; tty->ops->write(tty, "\r\n", 2); return 2; } ldata->canon_column = ldata->column; break; case '\r': if (O_ONOCR(tty) && ldata->column == 0) return 0; if (O_OCRNL(tty)) { c = '\n'; if (O_ONLRET(tty)) ldata->canon_column = ldata->column = 0; break; } ldata->canon_column = ldata->column = 0; break; case '\t': spaces = 8 - (ldata->column & 7); if (O_TABDLY(tty) == XTABS) { if (space < spaces) return -1; ldata->column += spaces; tty->ops->write(tty, " ", spaces); return spaces; } ldata->column += spaces; break; case '\b': if (ldata->column > 0) ldata->column--; break; default: if (!iscntrl(c)) { if (O_OLCUC(tty)) c = toupper(c); if (!is_continuation(c, tty)) ldata->column++; } break; } tty_put_char(tty, c); return 1; } /** * process_output - output post processor * @c: character (or partial unicode symbol) * @tty: terminal device * * Output one character with OPOST processing. * * Returns: -1 when the output device is full and the character must be * retried. * * Locking: %output_lock to protect column state and space left (also, this is *called from n_tty_write() under the tty layer write lock). */ static int process_output(u8 c, struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; guard(mutex)(&ldata->output_lock); if (do_output_char(c, tty, tty_write_room(tty)) < 0) return -1; return 0; } /** * process_output_block - block post processor * @tty: terminal device * @buf: character buffer * @nr: number of bytes to output * * Output a block of characters with OPOST processing. * * This path is used to speed up block console writes, among other things when * processing blocks of output data. It handles only the simple cases normally * found and helps to generate blocks of symbols for the console driver and * thus improve performance. * * Returns: the number of characters output. * * Locking: %output_lock to protect column state and space left (also, this is * called from n_tty_write() under the tty layer write lock). */ static ssize_t process_output_block(struct tty_struct *tty, const u8 *buf, unsigned int nr) { struct n_tty_data *ldata = tty->disc_data; unsigned int space, i; const u8 *cp; guard(mutex)(&ldata->output_lock); space = tty_write_room(tty); if (space == 0) return 0; if (nr > space) nr = space; for (i = 0, cp = buf; i < nr; i++, cp++) { u8 c = *cp; switch (c) { case '\n': if (O_ONLRET(tty)) ldata->column = 0; if (O_ONLCR(tty)) goto do_write; ldata->canon_column = ldata->column; break; case '\r': if (O_ONOCR(tty) && ldata->column == 0) goto do_write; if (O_OCRNL(tty)) goto do_write; ldata->canon_column = ldata->column = 0; break; case '\t': goto do_write; case '\b': if (ldata->column > 0) ldata->column--; break; default: if (!iscntrl(c)) { if (O_OLCUC(tty)) goto do_write; if (!is_continuation(c, tty)) ldata->column++; } break; } } do_write: return tty->ops->write(tty, buf, i); } static int n_tty_process_echo_ops(struct tty_struct *tty, size_t *tail, int space) { struct n_tty_data *ldata = tty->disc_data; u8 op; /* * Since add_echo_byte() is called without holding output_lock, we * might see only portion of multi-byte operation. */ if (MASK(ldata->echo_commit) == MASK(*tail + 1)) return -ENODATA; /* * If the buffer byte is the start of a multi-byte operation, get the * next byte, which is either the op code or a control character value. */ op = echo_buf(ldata, *tail + 1); switch (op) { case ECHO_OP_ERASE_TAB: { unsigned int num_chars, num_bs; if (MASK(ldata->echo_commit) == MASK(*tail + 2)) return -ENODATA; num_chars = echo_buf(ldata, *tail + 2); /* * Determine how many columns to go back in order to erase the * tab. This depends on the number of columns used by other * characters within the tab area. If this (modulo 8) count is * from the start of input rather than from a previous tab, we * offset by canon column. Otherwise, tab spacing is normal. */ if (!(num_chars & 0x80)) num_chars += ldata->canon_column; num_bs = 8 - (num_chars & 7); if (num_bs > space) return -ENOSPC; space -= num_bs; while (num_bs--) { tty_put_char(tty, '\b'); if (ldata->column > 0) ldata->column--; } *tail += 3; break; } case ECHO_OP_SET_CANON_COL: ldata->canon_column = ldata->column; *tail += 2; break; case ECHO_OP_MOVE_BACK_COL: if (ldata->column > 0) ldata->column--; *tail += 2; break; case ECHO_OP_START: /* This is an escaped echo op start code */ if (!space) return -ENOSPC; tty_put_char(tty, ECHO_OP_START); ldata->column++; space--; *tail += 2; break; default: /* * If the op is not a special byte code, it is a ctrl char * tagged to be echoed as "^X" (where X is the letter * representing the control char). Note that we must ensure * there is enough space for the whole ctrl pair. */ if (space < 2) return -ENOSPC; tty_put_char(tty, '^'); tty_put_char(tty, op ^ 0100); ldata->column += 2; space -= 2; *tail += 2; break; } return space; } /** * __process_echoes - write pending echo characters * @tty: terminal device * * Write previously buffered echo (and other ldisc-generated) characters to the * tty. * * Characters generated by the ldisc (including echoes) need to be buffered * because the driver's write buffer can fill during heavy program output. * Echoing straight to the driver will often fail under these conditions, * causing lost characters and resulting mismatches of ldisc state information. * * Since the ldisc state must represent the characters actually sent to the * driver at the time of the write, operations like certain changes in column * state are also saved in the buffer and executed here. * * A circular fifo buffer is used so that the most recent characters are * prioritized. Also, when control characters are echoed with a prefixed "^", * the pair is treated atomically and thus not separated. * * Locking: callers must hold %output_lock. */ static size_t __process_echoes(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; unsigned int space, old_space; size_t tail; u8 c; old_space = space = tty_write_room(tty); tail = ldata->echo_tail; while (MASK(ldata->echo_commit) != MASK(tail)) { c = echo_buf(ldata, tail); if (c == ECHO_OP_START) { int ret = n_tty_process_echo_ops(tty, &tail, space); if (ret == -ENODATA) goto not_yet_stored; if (ret < 0) break; space = ret; } else { if (O_OPOST(tty)) { int retval = do_output_char(c, tty, space); if (retval < 0) break; space -= retval; } else { if (!space) break; tty_put_char(tty, c); space -= 1; } tail += 1; } } /* If the echo buffer is nearly full (so that the possibility exists * of echo overrun before the next commit), then discard enough * data at the tail to prevent a subsequent overrun */ while (ldata->echo_commit > tail && ldata->echo_commit - tail >= ECHO_DISCARD_WATERMARK) { if (echo_buf(ldata, tail) == ECHO_OP_START) { if (echo_buf(ldata, tail + 1) == ECHO_OP_ERASE_TAB) tail += 3; else tail += 2; } else tail++; } not_yet_stored: ldata->echo_tail = tail; return old_space - space; } static void commit_echoes(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; size_t nr, old, echoed; size_t head; mutex_lock(&ldata->output_lock); head = ldata->echo_head; ldata->echo_mark = head; old = ldata->echo_commit - ldata->echo_tail; /* Process committed echoes if the accumulated # of bytes * is over the threshold (and try again each time another * block is accumulated) */ nr = head - ldata->echo_tail; if (nr < ECHO_COMMIT_WATERMARK || (nr % ECHO_BLOCK > old % ECHO_BLOCK)) { mutex_unlock(&ldata->output_lock); return; } ldata->echo_commit = head; echoed = __process_echoes(tty); mutex_unlock(&ldata->output_lock); if (echoed && tty->ops->flush_chars) tty->ops->flush_chars(tty); } static void process_echoes(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; size_t echoed; if (ldata->echo_mark == ldata->echo_tail) return; mutex_lock(&ldata->output_lock); ldata->echo_commit = ldata->echo_mark; echoed = __process_echoes(tty); mutex_unlock(&ldata->output_lock); if (echoed && tty->ops->flush_chars) tty->ops->flush_chars(tty); } /* NB: echo_mark and echo_head should be equivalent here */ static void flush_echoes(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; if ((!L_ECHO(tty) && !L_ECHONL(tty)) || ldata->echo_commit == ldata->echo_head) return; mutex_lock(&ldata->output_lock); ldata->echo_commit = ldata->echo_head; __process_echoes(tty); mutex_unlock(&ldata->output_lock); } /** * add_echo_byte - add a byte to the echo buffer * @c: unicode byte to echo * @ldata: n_tty data * * Add a character or operation byte to the echo buffer. */ static inline void add_echo_byte(u8 c, struct n_tty_data *ldata) { *echo_buf_addr(ldata, ldata->echo_head) = c; smp_wmb(); /* Matches smp_rmb() in echo_buf(). */ ldata->echo_head++; } /** * echo_move_back_col - add operation to move back a column * @ldata: n_tty data * * Add an operation to the echo buffer to move back one column. */ static void echo_move_back_col(struct n_tty_data *ldata) { add_echo_byte(ECHO_OP_START, ldata); add_echo_byte(ECHO_OP_MOVE_BACK_COL, ldata); } /** * echo_set_canon_col - add operation to set the canon column * @ldata: n_tty data * * Add an operation to the echo buffer to set the canon column to the current * column. */ static void echo_set_canon_col(struct n_tty_data *ldata) { add_echo_byte(ECHO_OP_START, ldata); add_echo_byte(ECHO_OP_SET_CANON_COL, ldata); } /** * echo_erase_tab - add operation to erase a tab * @num_chars: number of character columns already used * @after_tab: true if num_chars starts after a previous tab * @ldata: n_tty data * * Add an operation to the echo buffer to erase a tab. * * Called by the eraser function, which knows how many character columns have * been used since either a previous tab or the start of input. This * information will be used later, along with canon column (if applicable), to * go back the correct number of columns. */ static void echo_erase_tab(unsigned int num_chars, int after_tab, struct n_tty_data *ldata) { add_echo_byte(ECHO_OP_START, ldata); add_echo_byte(ECHO_OP_ERASE_TAB, ldata); /* We only need to know this modulo 8 (tab spacing) */ num_chars &= 7; /* Set the high bit as a flag if num_chars is after a previous tab */ if (after_tab) num_chars |= 0x80; add_echo_byte(num_chars, ldata); } /** * echo_char_raw - echo a character raw * @c: unicode byte to echo * @ldata: line disc data * * Echo user input back onto the screen. This must be called only when * L_ECHO(tty) is true. Called from the &tty_driver.receive_buf() path. * * This variant does not treat control characters specially. */ static void echo_char_raw(u8 c, struct n_tty_data *ldata) { if (c == ECHO_OP_START) { add_echo_byte(ECHO_OP_START, ldata); add_echo_byte(ECHO_OP_START, ldata); } else { add_echo_byte(c, ldata); } } /** * echo_char - echo a character * @c: unicode byte to echo * @tty: terminal device * * Echo user input back onto the screen. This must be called only when * L_ECHO(tty) is true. Called from the &tty_driver.receive_buf() path. * * This variant tags control characters to be echoed as "^X" (where X is the * letter representing the control char). */ static void echo_char(u8 c, const struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; if (c == ECHO_OP_START) { add_echo_byte(ECHO_OP_START, ldata); add_echo_byte(ECHO_OP_START, ldata); } else { if (L_ECHOCTL(tty) && iscntrl(c) && c != '\t') add_echo_byte(ECHO_OP_START, ldata); add_echo_byte(c, ldata); } } /** * finish_erasing - complete erase * @ldata: n_tty data */ static inline void finish_erasing(struct n_tty_data *ldata) { if (ldata->erasing) { echo_char_raw('/', ldata); ldata->erasing = 0; } } /** * eraser - handle erase function * @c: character input * @tty: terminal device * * Perform erase and necessary output when an erase character is present in the * stream from the driver layer. Handles the complexities of UTF-8 multibyte * symbols. * * Locking: n_tty_receive_buf()/producer path: * caller holds non-exclusive %termios_rwsem */ static void eraser(u8 c, const struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; enum { ERASE, WERASE, KILL } kill_type; size_t head; size_t cnt; int seen_alnums; if (ldata->read_head == ldata->canon_head) { /* process_output('\a', tty); */ /* what do you think? */ return; } if (c == ERASE_CHAR(tty)) kill_type = ERASE; else if (c == WERASE_CHAR(tty)) kill_type = WERASE; else { if (!L_ECHO(tty)) { ldata->read_head = ldata->canon_head; return; } if (!L_ECHOK(tty) || !L_ECHOKE(tty) || !L_ECHOE(tty)) { ldata->read_head = ldata->canon_head; finish_erasing(ldata); echo_char(KILL_CHAR(tty), tty); /* Add a newline if ECHOK is on and ECHOKE is off. */ if (L_ECHOK(tty)) echo_char_raw('\n', ldata); return; } kill_type = KILL; } seen_alnums = 0; while (MASK(ldata->read_head) != MASK(ldata->canon_head)) { head = ldata->read_head; /* erase a single possibly multibyte character */ do { head--; c = read_buf(ldata, head); } while (is_continuation(c, tty) && MASK(head) != MASK(ldata->canon_head)); /* do not partially erase */ if (is_continuation(c, tty)) break; if (kill_type == WERASE) { /* Equivalent to BSD's ALTWERASE. */ if (isalnum(c) || c == '_') seen_alnums++; else if (seen_alnums) break; } cnt = ldata->read_head - head; ldata->read_head = head; if (L_ECHO(tty)) { if (L_ECHOPRT(tty)) { if (!ldata->erasing) { echo_char_raw('\\', ldata); ldata->erasing = 1; } /* if cnt > 1, output a multi-byte character */ echo_char(c, tty); while (--cnt > 0) { head++; echo_char_raw(read_buf(ldata, head), ldata); echo_move_back_col(ldata); } } else if (kill_type == ERASE && !L_ECHOE(tty)) { echo_char(ERASE_CHAR(tty), tty); } else if (c == '\t') { unsigned int num_chars = 0; int after_tab = 0; size_t tail = ldata->read_head; /* * Count the columns used for characters * since the start of input or after a * previous tab. * This info is used to go back the correct * number of columns. */ while (MASK(tail) != MASK(ldata->canon_head)) { tail--; c = read_buf(ldata, tail); if (c == '\t') { after_tab = 1; break; } else if (iscntrl(c)) { if (L_ECHOCTL(tty)) num_chars += 2; } else if (!is_continuation(c, tty)) { num_chars++; } } echo_erase_tab(num_chars, after_tab, ldata); } else { if (iscntrl(c) && L_ECHOCTL(tty)) { echo_char_raw('\b', ldata); echo_char_raw(' ', ldata); echo_char_raw('\b', ldata); } if (!iscntrl(c) || L_ECHOCTL(tty)) { echo_char_raw('\b', ldata); echo_char_raw(' ', ldata); echo_char_raw('\b', ldata); } } } if (kill_type == ERASE) break; } if (ldata->read_head == ldata->canon_head && L_ECHO(tty)) finish_erasing(ldata); } static void __isig(int sig, struct tty_struct *tty) { struct pid *tty_pgrp = tty_get_pgrp(tty); if (tty_pgrp) { kill_pgrp(tty_pgrp, sig, 1); put_pid(tty_pgrp); } } /** * isig - handle the ISIG optio * @sig: signal * @tty: terminal * * Called when a signal is being sent due to terminal input. Called from the * &tty_driver.receive_buf() path, so serialized. * * Performs input and output flush if !NOFLSH. In this context, the echo * buffer is 'output'. The signal is processed first to alert any current * readers or writers to discontinue and exit their i/o loops. * * Locking: %ctrl.lock */ static void isig(int sig, struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; if (L_NOFLSH(tty)) { /* signal only */ __isig(sig, tty); } else { /* signal and flush */ up_read(&tty->termios_rwsem); down_write(&tty->termios_rwsem); __isig(sig, tty); /* clear echo buffer */ mutex_lock(&ldata->output_lock); ldata->echo_head = ldata->echo_tail = 0; ldata->echo_mark = ldata->echo_commit = 0; mutex_unlock(&ldata->output_lock); /* clear output buffer */ tty_driver_flush_buffer(tty); /* clear input buffer */ reset_buffer_flags(tty->disc_data); /* notify pty master of flush */ if (tty->link) n_tty_packet_mode_flush(tty); up_write(&tty->termios_rwsem); down_read(&tty->termios_rwsem); } } /** * n_tty_receive_break - handle break * @tty: terminal * * An RS232 break event has been hit in the incoming bitstream. This can cause * a variety of events depending upon the termios settings. * * Locking: n_tty_receive_buf()/producer path: * caller holds non-exclusive termios_rwsem * * Note: may get exclusive %termios_rwsem if flushing input buffer */ static void n_tty_receive_break(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; if (I_IGNBRK(tty)) return; if (I_BRKINT(tty)) { isig(SIGINT, tty); return; } if (I_PARMRK(tty)) { put_tty_queue('\377', ldata); put_tty_queue('\0', ldata); } put_tty_queue('\0', ldata); } /** * n_tty_receive_overrun - handle overrun reporting * @tty: terminal * * Data arrived faster than we could process it. While the tty driver has * flagged this the bits that were missed are gone forever. * * Called from the receive_buf path so single threaded. Does not need locking * as num_overrun and overrun_time are function private. */ static void n_tty_receive_overrun(const struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; ldata->num_overrun++; if (time_is_before_jiffies(ldata->overrun_time + HZ)) { tty_warn(tty, "%u input overrun(s)\n", ldata->num_overrun); ldata->overrun_time = jiffies; ldata->num_overrun = 0; } } /** * n_tty_receive_parity_error - error notifier * @tty: terminal device * @c: character * * Process a parity error and queue the right data to indicate the error case * if necessary. * * Locking: n_tty_receive_buf()/producer path: * caller holds non-exclusive %termios_rwsem */ static void n_tty_receive_parity_error(const struct tty_struct *tty, u8 c) { struct n_tty_data *ldata = tty->disc_data; if (I_INPCK(tty)) { if (I_IGNPAR(tty)) return; if (I_PARMRK(tty)) { put_tty_queue('\377', ldata); put_tty_queue('\0', ldata); put_tty_queue(c, ldata); } else put_tty_queue('\0', ldata); } else put_tty_queue(c, ldata); } static void n_tty_receive_signal_char(struct tty_struct *tty, int signal, u8 c) { isig(signal, tty); if (I_IXON(tty)) start_tty(tty); if (L_ECHO(tty)) { echo_char(c, tty); commit_echoes(tty); } else process_echoes(tty); } static bool n_tty_is_char_flow_ctrl(struct tty_struct *tty, u8 c) { return c == START_CHAR(tty) || c == STOP_CHAR(tty); } /** * n_tty_receive_char_flow_ctrl - receive flow control chars * @tty: terminal device * @c: character * @lookahead_done: lookahead has processed this character already * * Receive and process flow control character actions. * * In case lookahead for flow control chars already handled the character in * advance to the normal receive, the actions are skipped during normal * receive. * * Returns true if @c is consumed as flow-control character, the character * must not be treated as normal character. */ static bool n_tty_receive_char_flow_ctrl(struct tty_struct *tty, u8 c, bool lookahead_done) { if (!n_tty_is_char_flow_ctrl(tty, c)) return false; if (lookahead_done) return true; if (c == START_CHAR(tty)) { start_tty(tty); process_echoes(tty); return true; } /* STOP_CHAR */ stop_tty(tty); return true; } static void n_tty_receive_handle_newline(struct tty_struct *tty, u8 c) { struct n_tty_data *ldata = tty->disc_data; set_bit(MASK(ldata->read_head), ldata->read_flags); put_tty_queue(c, ldata); smp_store_release(&ldata->canon_head, ldata->read_head); kill_fasync(&tty->fasync, SIGIO, POLL_IN); wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); } static bool n_tty_receive_char_canon(struct tty_struct *tty, u8 c) { struct n_tty_data *ldata = tty->disc_data; if (c == ERASE_CHAR(tty) || c == KILL_CHAR(tty) || (c == WERASE_CHAR(tty) && L_IEXTEN(tty))) { eraser(c, tty); commit_echoes(tty); return true; } if (c == LNEXT_CHAR(tty) && L_IEXTEN(tty)) { ldata->lnext = 1; if (L_ECHO(tty)) { finish_erasing(ldata); if (L_ECHOCTL(tty)) { echo_char_raw('^', ldata); echo_char_raw('\b', ldata); commit_echoes(tty); } } return true; } if (c == REPRINT_CHAR(tty) && L_ECHO(tty) && L_IEXTEN(tty)) { size_t tail = ldata->canon_head; finish_erasing(ldata); echo_char(c, tty); echo_char_raw('\n', ldata); while (MASK(tail) != MASK(ldata->read_head)) { echo_char(read_buf(ldata, tail), tty); tail++; } commit_echoes(tty); return true; } if (c == '\n') { if (L_ECHO(tty) || L_ECHONL(tty)) { echo_char_raw('\n', ldata); commit_echoes(tty); } n_tty_receive_handle_newline(tty, c); return true; } if (c == EOF_CHAR(tty)) { c = __DISABLED_CHAR; n_tty_receive_handle_newline(tty, c); return true; } if ((c == EOL_CHAR(tty)) || (c == EOL2_CHAR(tty) && L_IEXTEN(tty))) { /* * XXX are EOL_CHAR and EOL2_CHAR echoed?!? */ if (L_ECHO(tty)) { /* Record the column of first canon char. */ if (ldata->canon_head == ldata->read_head) echo_set_canon_col(ldata); echo_char(c, tty); commit_echoes(tty); } /* * XXX does PARMRK doubling happen for * EOL_CHAR and EOL2_CHAR? */ if (c == '\377' && I_PARMRK(tty)) put_tty_queue(c, ldata); n_tty_receive_handle_newline(tty, c); return true; } return false; } static void n_tty_receive_char_special(struct tty_struct *tty, u8 c, bool lookahead_done) { struct n_tty_data *ldata = tty->disc_data; if (I_IXON(tty) && n_tty_receive_char_flow_ctrl(tty, c, lookahead_done)) return; if (L_ISIG(tty)) { if (c == INTR_CHAR(tty)) { n_tty_receive_signal_char(tty, SIGINT, c); return; } else if (c == QUIT_CHAR(tty)) { n_tty_receive_signal_char(tty, SIGQUIT, c); return; } else if (c == SUSP_CHAR(tty)) { n_tty_receive_signal_char(tty, SIGTSTP, c); return; } } if (tty->flow.stopped && !tty->flow.tco_stopped && I_IXON(tty) && I_IXANY(tty)) { start_tty(tty); process_echoes(tty); } if (c == '\r') { if (I_IGNCR(tty)) return; if (I_ICRNL(tty)) c = '\n'; } else if (c == '\n' && I_INLCR(tty)) c = '\r'; if (ldata->icanon && n_tty_receive_char_canon(tty, c)) return; if (L_ECHO(tty)) { finish_erasing(ldata); if (c == '\n') echo_char_raw('\n', ldata); else { /* Record the column of first canon char. */ if (ldata->canon_head == ldata->read_head) echo_set_canon_col(ldata); echo_char(c, tty); } commit_echoes(tty); } /* PARMRK doubling check */ if (c == '\377' && I_PARMRK(tty)) put_tty_queue(c, ldata); put_tty_queue(c, ldata); } /** * n_tty_receive_char - perform processing * @tty: terminal device * @c: character * * Process an individual character of input received from the driver. This is * serialized with respect to itself by the rules for the driver above. * * Locking: n_tty_receive_buf()/producer path: * caller holds non-exclusive %termios_rwsem * publishes canon_head if canonical mode is active */ static void n_tty_receive_char(struct tty_struct *tty, u8 c) { struct n_tty_data *ldata = tty->disc_data; if (tty->flow.stopped && !tty->flow.tco_stopped && I_IXON(tty) && I_IXANY(tty)) { start_tty(tty); process_echoes(tty); } if (L_ECHO(tty)) { finish_erasing(ldata); /* Record the column of first canon char. */ if (ldata->canon_head == ldata->read_head) echo_set_canon_col(ldata); echo_char(c, tty); commit_echoes(tty); } /* PARMRK doubling check */ if (c == '\377' && I_PARMRK(tty)) put_tty_queue(c, ldata); put_tty_queue(c, ldata); } static void n_tty_receive_char_closing(struct tty_struct *tty, u8 c, bool lookahead_done) { if (I_ISTRIP(tty)) c &= 0x7f; if (I_IUCLC(tty) && L_IEXTEN(tty)) c = tolower(c); if (I_IXON(tty)) { if (!n_tty_receive_char_flow_ctrl(tty, c, lookahead_done) && tty->flow.stopped && !tty->flow.tco_stopped && I_IXANY(tty) && c != INTR_CHAR(tty) && c != QUIT_CHAR(tty) && c != SUSP_CHAR(tty)) { start_tty(tty); process_echoes(tty); } } } static void n_tty_receive_char_flagged(struct tty_struct *tty, u8 c, u8 flag) { switch (flag) { case TTY_BREAK: n_tty_receive_break(tty); break; case TTY_PARITY: case TTY_FRAME: n_tty_receive_parity_error(tty, c); break; case TTY_OVERRUN: n_tty_receive_overrun(tty); break; default: tty_err(tty, "unknown flag %u\n", flag); break; } } static void n_tty_receive_char_lnext(struct tty_struct *tty, u8 c, u8 flag) { struct n_tty_data *ldata = tty->disc_data; ldata->lnext = 0; if (likely(flag == TTY_NORMAL)) { if (I_ISTRIP(tty)) c &= 0x7f; if (I_IUCLC(tty) && L_IEXTEN(tty)) c = tolower(c); n_tty_receive_char(tty, c); } else n_tty_receive_char_flagged(tty, c, flag); } /* Caller must ensure count > 0 */ static void n_tty_lookahead_flow_ctrl(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { struct n_tty_data *ldata = tty->disc_data; u8 flag = TTY_NORMAL; ldata->lookahead_count += count; if (!I_IXON(tty)) return; while (count--) { if (fp) flag = *fp++; if (likely(flag == TTY_NORMAL)) n_tty_receive_char_flow_ctrl(tty, *cp, false); cp++; } } static void n_tty_receive_buf_real_raw(const struct tty_struct *tty, const u8 *cp, size_t count) { struct n_tty_data *ldata = tty->disc_data; /* handle buffer wrap-around by a loop */ for (unsigned int i = 0; i < 2; i++) { size_t head = MASK(ldata->read_head); size_t n = min(count, N_TTY_BUF_SIZE - head); memcpy(read_buf_addr(ldata, head), cp, n); ldata->read_head += n; cp += n; count -= n; } } static void n_tty_receive_buf_raw(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { struct n_tty_data *ldata = tty->disc_data; u8 flag = TTY_NORMAL; while (count--) { if (fp) flag = *fp++; if (likely(flag == TTY_NORMAL)) put_tty_queue(*cp++, ldata); else n_tty_receive_char_flagged(tty, *cp++, flag); } } static void n_tty_receive_buf_closing(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count, bool lookahead_done) { u8 flag = TTY_NORMAL; while (count--) { if (fp) flag = *fp++; if (likely(flag == TTY_NORMAL)) n_tty_receive_char_closing(tty, *cp++, lookahead_done); } } static void n_tty_receive_buf_standard(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count, bool lookahead_done) { struct n_tty_data *ldata = tty->disc_data; u8 flag = TTY_NORMAL; while (count--) { u8 c = *cp++; if (fp) flag = *fp++; if (ldata->lnext) { n_tty_receive_char_lnext(tty, c, flag); continue; } if (unlikely(flag != TTY_NORMAL)) { n_tty_receive_char_flagged(tty, c, flag); continue; } if (I_ISTRIP(tty)) c &= 0x7f; if (I_IUCLC(tty) && L_IEXTEN(tty)) c = tolower(c); if (L_EXTPROC(tty)) { put_tty_queue(c, ldata); continue; } if (test_bit(c, ldata->char_map)) n_tty_receive_char_special(tty, c, lookahead_done); else n_tty_receive_char(tty, c); } } static void __receive_buf(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { struct n_tty_data *ldata = tty->disc_data; bool preops = I_ISTRIP(tty) || (I_IUCLC(tty) && L_IEXTEN(tty)); size_t la_count = min(ldata->lookahead_count, count); if (ldata->real_raw) n_tty_receive_buf_real_raw(tty, cp, count); else if (ldata->raw || (L_EXTPROC(tty) && !preops)) n_tty_receive_buf_raw(tty, cp, fp, count); else if (tty->closing && !L_EXTPROC(tty)) { if (la_count > 0) { n_tty_receive_buf_closing(tty, cp, fp, la_count, true); cp += la_count; if (fp) fp += la_count; count -= la_count; } if (count > 0) n_tty_receive_buf_closing(tty, cp, fp, count, false); } else { if (la_count > 0) { n_tty_receive_buf_standard(tty, cp, fp, la_count, true); cp += la_count; if (fp) fp += la_count; count -= la_count; } if (count > 0) n_tty_receive_buf_standard(tty, cp, fp, count, false); flush_echoes(tty); if (tty->ops->flush_chars) tty->ops->flush_chars(tty); } ldata->lookahead_count -= la_count; if (ldata->icanon && !L_EXTPROC(tty)) return; /* publish read_head to consumer */ smp_store_release(&ldata->commit_head, ldata->read_head); if (read_cnt(ldata)) { kill_fasync(&tty->fasync, SIGIO, POLL_IN); wake_up_interruptible_poll(&tty->read_wait, EPOLLIN | EPOLLRDNORM); } } /** * n_tty_receive_buf_common - process input * @tty: device to receive input * @cp: input chars * @fp: flags for each char (if %NULL, all chars are %TTY_NORMAL) * @count: number of input chars in @cp * @flow: enable flow control * * Called by the terminal driver when a block of characters has been received. * This function must be called from soft contexts not from interrupt context. * The driver is responsible for making calls one at a time and in order (or * using flush_to_ldisc()). * * Returns: the # of input chars from @cp which were processed. * * In canonical mode, the maximum line length is 4096 chars (including the line * termination char); lines longer than 4096 chars are truncated. After 4095 * chars, input data is still processed but not stored. Overflow processing * ensures the tty can always receive more input until at least one line can be * read. * * In non-canonical mode, the read buffer will only accept 4095 chars; this * provides the necessary space for a newline char if the input mode is * switched to canonical. * * Note it is possible for the read buffer to _contain_ 4096 chars in * non-canonical mode: the read buffer could already contain the maximum canon * line of 4096 chars when the mode is switched to non-canonical. * * Locking: n_tty_receive_buf()/producer path: * claims non-exclusive %termios_rwsem * publishes commit_head or canon_head */ static size_t n_tty_receive_buf_common(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count, bool flow) { struct n_tty_data *ldata = tty->disc_data; size_t n, rcvd = 0; int room, overflow; down_read(&tty->termios_rwsem); do { /* * When PARMRK is set, each input char may take up to 3 chars * in the read buf; reduce the buffer space avail by 3x * * If we are doing input canonicalization, and there are no * pending newlines, let characters through without limit, so * that erase characters will be handled. Other excess * characters will be beeped. * * paired with store in *_copy_from_read_buf() -- guarantees * the consumer has loaded the data in read_buf up to the new * read_tail (so this producer will not overwrite unread data) */ size_t tail = smp_load_acquire(&ldata->read_tail); room = N_TTY_BUF_SIZE - (ldata->read_head - tail); if (I_PARMRK(tty)) room = DIV_ROUND_UP(room, 3); room--; if (room <= 0) { overflow = ldata->icanon && ldata->canon_head == tail; if (overflow && room < 0) ldata->read_head--; room = overflow; WRITE_ONCE(ldata->no_room, flow && !room); } else overflow = 0; n = min_t(size_t, count, room); if (!n) break; /* ignore parity errors if handling overflow */ if (!overflow || !fp || *fp != TTY_PARITY) __receive_buf(tty, cp, fp, n); cp += n; if (fp) fp += n; count -= n; rcvd += n; } while (!test_bit(TTY_LDISC_CHANGING, &tty->flags)); tty->receive_room = room; /* Unthrottle if handling overflow on pty */ if (tty->driver->type == TTY_DRIVER_TYPE_PTY) { if (overflow) { tty_set_flow_change(tty, TTY_UNTHROTTLE_SAFE); tty_unthrottle_safe(tty); __tty_set_flow_change(tty, 0); } } else n_tty_check_throttle(tty); if (unlikely(ldata->no_room)) { /* * Barrier here is to ensure to read the latest read_tail in * chars_in_buffer() and to make sure that read_tail is not loaded * before ldata->no_room is set. */ smp_mb(); if (!chars_in_buffer(tty)) n_tty_kick_worker(tty); } up_read(&tty->termios_rwsem); return rcvd; } static void n_tty_receive_buf(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { n_tty_receive_buf_common(tty, cp, fp, count, false); } static size_t n_tty_receive_buf2(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { return n_tty_receive_buf_common(tty, cp, fp, count, true); } /** * n_tty_set_termios - termios data changed * @tty: terminal * @old: previous data * * Called by the tty layer when the user changes termios flags so that the line * discipline can plan ahead. This function cannot sleep and is protected from * re-entry by the tty layer. The user is guaranteed that this function will * not be re-entered or in progress when the ldisc is closed. * * Locking: Caller holds @tty->termios_rwsem */ static void n_tty_set_termios(struct tty_struct *tty, const struct ktermios *old) { struct n_tty_data *ldata = tty->disc_data; if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) { bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE); ldata->line_start = ldata->read_tail; if (!L_ICANON(tty) || !read_cnt(ldata)) { ldata->canon_head = ldata->read_tail; ldata->push = 0; } else { set_bit(MASK(ldata->read_head - 1), ldata->read_flags); ldata->canon_head = ldata->read_head; ldata->push = 1; } ldata->commit_head = ldata->read_head; ldata->erasing = 0; ldata->lnext = 0; } ldata->icanon = (L_ICANON(tty) != 0); if (I_ISTRIP(tty) || I_IUCLC(tty) || I_IGNCR(tty) || I_ICRNL(tty) || I_INLCR(tty) || L_ICANON(tty) || I_IXON(tty) || L_ISIG(tty) || L_ECHO(tty) || I_PARMRK(tty)) { bitmap_zero(ldata->char_map, 256); if (I_IGNCR(tty) || I_ICRNL(tty)) set_bit('\r', ldata->char_map); if (I_INLCR(tty)) set_bit('\n', ldata->char_map); if (L_ICANON(tty)) { set_bit(ERASE_CHAR(tty), ldata->char_map); set_bit(KILL_CHAR(tty), ldata->char_map); set_bit(EOF_CHAR(tty), ldata->char_map); set_bit('\n', ldata->char_map); set_bit(EOL_CHAR(tty), ldata->char_map); if (L_IEXTEN(tty)) { set_bit(WERASE_CHAR(tty), ldata->char_map); set_bit(LNEXT_CHAR(tty), ldata->char_map); set_bit(EOL2_CHAR(tty), ldata->char_map); if (L_ECHO(tty)) set_bit(REPRINT_CHAR(tty), ldata->char_map); } } if (I_IXON(tty)) { set_bit(START_CHAR(tty), ldata->char_map); set_bit(STOP_CHAR(tty), ldata->char_map); } if (L_ISIG(tty)) { set_bit(INTR_CHAR(tty), ldata->char_map); set_bit(QUIT_CHAR(tty), ldata->char_map); set_bit(SUSP_CHAR(tty), ldata->char_map); } clear_bit(__DISABLED_CHAR, ldata->char_map); ldata->raw = 0; ldata->real_raw = 0; } else { ldata->raw = 1; if ((I_IGNBRK(tty) || (!I_BRKINT(tty) && !I_PARMRK(tty))) && (I_IGNPAR(tty) || !I_INPCK(tty)) && (tty->driver->flags & TTY_DRIVER_REAL_RAW)) ldata->real_raw = 1; else ldata->real_raw = 0; } /* * Fix tty hang when I_IXON(tty) is cleared, but the tty * been stopped by STOP_CHAR(tty) before it. */ if (!I_IXON(tty) && old && (old->c_iflag & IXON) && !tty->flow.tco_stopped) { start_tty(tty); process_echoes(tty); } /* The termios change make the tty ready for I/O */ wake_up_interruptible(&tty->write_wait); wake_up_interruptible(&tty->read_wait); } /** * n_tty_close - close the ldisc for this tty * @tty: device * * Called from the terminal layer when this line discipline is being shut down, * either because of a close or becsuse of a discipline change. The function * will not be called while other ldisc methods are in progress. */ static void n_tty_close(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; if (tty->link) n_tty_packet_mode_flush(tty); down_write(&tty->termios_rwsem); vfree(ldata); tty->disc_data = NULL; up_write(&tty->termios_rwsem); } /** * n_tty_open - open an ldisc * @tty: terminal to open * * Called when this line discipline is being attached to the terminal device. * Can sleep. Called serialized so that no other events will occur in parallel. * No further open will occur until a close. */ static int n_tty_open(struct tty_struct *tty) { struct n_tty_data *ldata; /* Currently a malloc failure here can panic */ ldata = vzalloc(sizeof(*ldata)); if (!ldata) return -ENOMEM; ldata->overrun_time = jiffies; mutex_init(&ldata->atomic_read_lock); mutex_init(&ldata->output_lock); tty->disc_data = ldata; tty->closing = 0; /* indicate buffer work may resume */ clear_bit(TTY_LDISC_HALTED, &tty->flags); n_tty_set_termios(tty, NULL); tty_unthrottle(tty); return 0; } static inline int input_available_p(const struct tty_struct *tty, int poll) { const struct n_tty_data *ldata = tty->disc_data; int amt = poll && !TIME_CHAR(tty) && MIN_CHAR(tty) ? MIN_CHAR(tty) : 1; if (ldata->icanon && !L_EXTPROC(tty)) return ldata->canon_head != ldata->read_tail; else return ldata->commit_head - ldata->read_tail >= amt; } /** * copy_from_read_buf - copy read data directly * @tty: terminal device * @kbp: data * @nr: size of data * * Helper function to speed up n_tty_read(). It is only called when %ICANON is * off; it copies characters straight from the tty queue. * * Returns: true if it successfully copied data, but there is still more data * to be had. * * Locking: * * called under the @ldata->atomic_read_lock sem * * n_tty_read()/consumer path: * caller holds non-exclusive %termios_rwsem; * read_tail published */ static bool copy_from_read_buf(const struct tty_struct *tty, u8 **kbp, size_t *nr) { struct n_tty_data *ldata = tty->disc_data; size_t n; bool is_eof; size_t head = smp_load_acquire(&ldata->commit_head); size_t tail = MASK(ldata->read_tail); n = min3(head - ldata->read_tail, N_TTY_BUF_SIZE - tail, *nr); if (!n) return false; u8 *from = read_buf_addr(ldata, tail); memcpy(*kbp, from, n); is_eof = n == 1 && *from == EOF_CHAR(tty); tty_audit_add_data(tty, from, n); zero_buffer(tty, from, n); smp_store_release(&ldata->read_tail, ldata->read_tail + n); /* Turn single EOF into zero-length read */ if (L_EXTPROC(tty) && ldata->icanon && is_eof && head == ldata->read_tail) return false; *kbp += n; *nr -= n; /* If we have more to copy, let the caller know */ return head != ldata->read_tail; } /** * canon_copy_from_read_buf - copy read data in canonical mode * @tty: terminal device * @kbp: data * @nr: size of data * * Helper function for n_tty_read(). It is only called when %ICANON is on; it * copies one line of input up to and including the line-delimiting character * into the result buffer. * * Note: When termios is changed from non-canonical to canonical mode and the * read buffer contains data, n_tty_set_termios() simulates an EOF push (as if * C-d were input) _without_ the %DISABLED_CHAR in the buffer. This causes data * already processed as input to be immediately available as input although a * newline has not been received. * * Locking: * * called under the %atomic_read_lock mutex * * n_tty_read()/consumer path: * caller holds non-exclusive %termios_rwsem; * read_tail published */ static bool canon_copy_from_read_buf(const struct tty_struct *tty, u8 **kbp, size_t *nr) { struct n_tty_data *ldata = tty->disc_data; size_t n, size, more, c; size_t eol; size_t tail, canon_head; int found = 0; /* N.B. avoid overrun if nr == 0 */ if (!*nr) return false; canon_head = smp_load_acquire(&ldata->canon_head); n = min(*nr, canon_head - ldata->read_tail); tail = MASK(ldata->read_tail); size = min_t(size_t, tail + n, N_TTY_BUF_SIZE); eol = find_next_bit(ldata->read_flags, size, tail); more = n - (size - tail); if (eol == N_TTY_BUF_SIZE && more) { /* scan wrapped without finding set bit */ eol = find_first_bit(ldata->read_flags, more); found = eol != more; } else found = eol != size; n = eol - tail; if (n > N_TTY_BUF_SIZE) n += N_TTY_BUF_SIZE; c = n + found; if (!found || read_buf(ldata, eol) != __DISABLED_CHAR) n = c; tty_copy(tty, *kbp, tail, n); *kbp += n; *nr -= n; if (found) clear_bit(eol, ldata->read_flags); smp_store_release(&ldata->read_tail, ldata->read_tail + c); if (found) { if (!ldata->push) ldata->line_start = ldata->read_tail; else ldata->push = 0; tty_audit_push(); return false; } /* No EOL found - do a continuation retry if there is more data */ return ldata->read_tail != canon_head; } /* * If we finished a read at the exact location of an * EOF (special EOL character that's a __DISABLED_CHAR) * in the stream, silently eat the EOF. */ static void canon_skip_eof(struct n_tty_data *ldata) { size_t tail, canon_head; canon_head = smp_load_acquire(&ldata->canon_head); tail = ldata->read_tail; // No data? if (tail == canon_head) return; // See if the tail position is EOF in the circular buffer tail &= (N_TTY_BUF_SIZE - 1); if (!test_bit(tail, ldata->read_flags)) return; if (read_buf(ldata, tail) != __DISABLED_CHAR) return; // Clear the EOL bit, skip the EOF char. clear_bit(tail, ldata->read_flags); smp_store_release(&ldata->read_tail, ldata->read_tail + 1); } /** * job_control - check job control * @tty: tty * @file: file handle * * Perform job control management checks on this @file/@tty descriptor and if * appropriate send any needed signals and return a negative error code if * action should be taken. * * Locking: * * redirected write test is safe * * current->signal->tty check is safe * * ctrl.lock to safely reference @tty->ctrl.pgrp */ static int job_control(struct tty_struct *tty, struct file *file) { /* Job control check -- must be done at start and after every sleep (POSIX.1 7.1.1.4). */ /* NOTE: not yet done after every sleep pending a thorough check of the logic of this change. -- jlc */ /* don't stop on /dev/console */ if (file->f_op->write_iter == redirected_tty_write) return 0; return __tty_check_change(tty, SIGTTIN); } /* * We still hold the atomic_read_lock and the termios_rwsem, and can just * continue to copy data. */ static ssize_t n_tty_continue_cookie(struct tty_struct *tty, u8 *kbuf, size_t nr, void **cookie) { struct n_tty_data *ldata = tty->disc_data; u8 *kb = kbuf; if (ldata->icanon && !L_EXTPROC(tty)) { /* * If we have filled the user buffer, see if we should skip an * EOF character before releasing the lock and returning done. */ if (!nr) canon_skip_eof(ldata); else if (canon_copy_from_read_buf(tty, &kb, &nr)) return kb - kbuf; } else { if (copy_from_read_buf(tty, &kb, &nr)) return kb - kbuf; } /* No more data - release locks and stop retries */ n_tty_kick_worker(tty); n_tty_check_unthrottle(tty); up_read(&tty->termios_rwsem); mutex_unlock(&ldata->atomic_read_lock); *cookie = NULL; return kb - kbuf; } static int n_tty_wait_for_input(struct tty_struct *tty, struct file *file, struct wait_queue_entry *wait, long *timeout) { if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) return -EIO; if (tty_hung_up_p(file)) return 0; /* * Abort readers for ttys which never actually get hung up. * See __tty_hangup(). */ if (test_bit(TTY_HUPPING, &tty->flags)) return 0; if (!*timeout) return 0; if (tty_io_nonblock(tty, file)) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; up_read(&tty->termios_rwsem); *timeout = wait_woken(wait, TASK_INTERRUPTIBLE, *timeout); down_read(&tty->termios_rwsem); return 1; } /** * n_tty_read - read function for tty * @tty: tty device * @file: file object * @kbuf: kernelspace buffer pointer * @nr: size of I/O * @cookie: if non-%NULL, this is a continuation read * @offset: where to continue reading from (unused in n_tty) * * Perform reads for the line discipline. We are guaranteed that the line * discipline will not be closed under us but we may get multiple parallel * readers and must handle this ourselves. We may also get a hangup. Always * called in user context, may sleep. * * This code must be sure never to sleep through a hangup. * * Locking: n_tty_read()/consumer path: * claims non-exclusive termios_rwsem; * publishes read_tail */ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, u8 *kbuf, size_t nr, void **cookie, unsigned long offset) { struct n_tty_data *ldata = tty->disc_data; u8 *kb = kbuf; DEFINE_WAIT_FUNC(wait, woken_wake_function); int minimum, time; ssize_t retval; long timeout; bool packet; size_t old_tail; /* Is this a continuation of a read started earlier? */ if (*cookie) return n_tty_continue_cookie(tty, kbuf, nr, cookie); retval = job_control(tty, file); if (retval < 0) return retval; /* * Internal serialization of reads. */ if (file->f_flags & O_NONBLOCK) { if (!mutex_trylock(&ldata->atomic_read_lock)) return -EAGAIN; } else { if (mutex_lock_interruptible(&ldata->atomic_read_lock)) return -ERESTARTSYS; } down_read(&tty->termios_rwsem); minimum = time = 0; timeout = MAX_SCHEDULE_TIMEOUT; if (!ldata->icanon) { minimum = MIN_CHAR(tty); if (minimum) { time = (HZ / 10) * TIME_CHAR(tty); } else { timeout = (HZ / 10) * TIME_CHAR(tty); minimum = 1; } } packet = tty->ctrl.packet; old_tail = ldata->read_tail; add_wait_queue(&tty->read_wait, &wait); while (nr) { /* First test for status change. */ if (packet && tty->link->ctrl.pktstatus) { u8 cs; if (kb != kbuf) break; spin_lock_irq(&tty->link->ctrl.lock); cs = tty->link->ctrl.pktstatus; tty->link->ctrl.pktstatus = 0; spin_unlock_irq(&tty->link->ctrl.lock); *kb++ = cs; nr--; break; } if (!input_available_p(tty, 0)) { up_read(&tty->termios_rwsem); tty_buffer_flush_work(tty->port); down_read(&tty->termios_rwsem); if (!input_available_p(tty, 0)) { int ret = n_tty_wait_for_input(tty, file, &wait, &timeout); if (ret <= 0) { retval = ret; break; } continue; } } if (ldata->icanon && !L_EXTPROC(tty)) { if (canon_copy_from_read_buf(tty, &kb, &nr)) goto more_to_be_read; } else { /* Deal with packet mode. */ if (packet && kb == kbuf) { *kb++ = TIOCPKT_DATA; nr--; } if (copy_from_read_buf(tty, &kb, &nr) && kb - kbuf >= minimum) goto more_to_be_read; } n_tty_check_unthrottle(tty); if (kb - kbuf >= minimum) break; if (time) timeout = time; } if (old_tail != ldata->read_tail) { /* * Make sure no_room is not read in n_tty_kick_worker() * before setting ldata->read_tail in copy_from_read_buf(). */ smp_mb(); n_tty_kick_worker(tty); } up_read(&tty->termios_rwsem); remove_wait_queue(&tty->read_wait, &wait); mutex_unlock(&ldata->atomic_read_lock); if (kb - kbuf) retval = kb - kbuf; return retval; more_to_be_read: /* * There is more to be had and we have nothing more to wait for, so * let's mark us for retries. * * NOTE! We return here with both the termios_sem and atomic_read_lock * still held, the retries will release them when done. */ remove_wait_queue(&tty->read_wait, &wait); *cookie = cookie; return kb - kbuf; } /** * n_tty_write - write function for tty * @tty: tty device * @file: file object * @buf: userspace buffer pointer * @nr: size of I/O * * Write function of the terminal device. This is serialized with respect to * other write callers but not to termios changes, reads and other such events. * Since the receive code will echo characters, thus calling driver write * methods, the %output_lock is used in the output processing functions called * here as well as in the echo processing function to protect the column state * and space left in the buffer. * * This code must be sure never to sleep through a hangup. * * Locking: output_lock to protect column state and space left * (note that the process_output*() functions take this lock themselves) */ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file, const u8 *buf, size_t nr) { const u8 *b = buf; DEFINE_WAIT_FUNC(wait, woken_wake_function); ssize_t num, retval = 0; /* Job control check -- must be done at start (POSIX.1 7.1.1.4). */ if (L_TOSTOP(tty) && file->f_op->write_iter != redirected_tty_write) { retval = tty_check_change(tty); if (retval) return retval; } down_read(&tty->termios_rwsem); /* Write out any echoed characters that are still pending */ process_echoes(tty); add_wait_queue(&tty->write_wait, &wait); while (1) { if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (tty_hung_up_p(file) || (tty->link && !tty->link->count)) { retval = -EIO; break; } if (O_OPOST(tty)) { while (nr > 0) { num = process_output_block(tty, b, nr); if (num < 0) { if (num == -EAGAIN) break; retval = num; goto break_out; } b += num; nr -= num; if (nr == 0) break; if (process_output(*b, tty) < 0) break; b++; nr--; } if (tty->ops->flush_chars) tty->ops->flush_chars(tty); } else { struct n_tty_data *ldata = tty->disc_data; while (nr > 0) { mutex_lock(&ldata->output_lock); num = tty->ops->write(tty, b, nr); mutex_unlock(&ldata->output_lock); if (num < 0) { retval = num; goto break_out; } if (!num) break; b += num; nr -= num; } } if (!nr) break; if (tty_io_nonblock(tty, file)) { retval = -EAGAIN; break; } up_read(&tty->termios_rwsem); wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); down_read(&tty->termios_rwsem); } break_out: remove_wait_queue(&tty->write_wait, &wait); if (nr && tty->fasync) set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); up_read(&tty->termios_rwsem); return (b - buf) ? b - buf : retval; } /** * n_tty_poll - poll method for N_TTY * @tty: terminal device * @file: file accessing it * @wait: poll table * * Called when the line discipline is asked to poll() for data or for special * events. This code is not serialized with respect to other events save * open/close. * * This code must be sure never to sleep through a hangup. * * Locking: called without the kernel lock held -- fine. */ static __poll_t n_tty_poll(struct tty_struct *tty, struct file *file, poll_table *wait) { __poll_t mask = 0; poll_wait(file, &tty->read_wait, wait); poll_wait(file, &tty->write_wait, wait); if (input_available_p(tty, 1)) mask |= EPOLLIN | EPOLLRDNORM; else { tty_buffer_flush_work(tty->port); if (input_available_p(tty, 1)) mask |= EPOLLIN | EPOLLRDNORM; } if (tty->ctrl.packet && tty->link->ctrl.pktstatus) mask |= EPOLLPRI | EPOLLIN | EPOLLRDNORM; if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) mask |= EPOLLHUP; if (tty_hung_up_p(file)) mask |= EPOLLHUP; if (tty->ops->write && !tty_is_writelocked(tty) && tty_chars_in_buffer(tty) < WAKEUP_CHARS && tty_write_room(tty) > 0) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } static unsigned long inq_canon(struct n_tty_data *ldata) { size_t nr, head, tail; if (ldata->canon_head == ldata->read_tail) return 0; head = ldata->canon_head; tail = ldata->read_tail; nr = head - tail; /* Skip EOF-chars.. */ while (MASK(head) != MASK(tail)) { if (test_bit(MASK(tail), ldata->read_flags) && read_buf(ldata, tail) == __DISABLED_CHAR) nr--; tail++; } return nr; } static int n_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct n_tty_data *ldata = tty->disc_data; unsigned int num; switch (cmd) { case TIOCOUTQ: return put_user(tty_chars_in_buffer(tty), (int __user *) arg); case TIOCINQ: down_write(&tty->termios_rwsem); if (L_ICANON(tty) && !L_EXTPROC(tty)) num = inq_canon(ldata); else num = read_cnt(ldata); up_write(&tty->termios_rwsem); return put_user(num, (unsigned int __user *) arg); default: return n_tty_ioctl_helper(tty, cmd, arg); } } static struct tty_ldisc_ops n_tty_ops = { .owner = THIS_MODULE, .num = N_TTY, .name = "n_tty", .open = n_tty_open, .close = n_tty_close, .flush_buffer = n_tty_flush_buffer, .read = n_tty_read, .write = n_tty_write, .ioctl = n_tty_ioctl, .set_termios = n_tty_set_termios, .poll = n_tty_poll, .receive_buf = n_tty_receive_buf, .write_wakeup = n_tty_write_wakeup, .receive_buf2 = n_tty_receive_buf2, .lookahead_buf = n_tty_lookahead_flow_ctrl, }; /** * n_tty_inherit_ops - inherit N_TTY methods * @ops: struct tty_ldisc_ops where to save N_TTY methods * * Enables a 'subclass' line discipline to 'inherit' N_TTY methods. */ void n_tty_inherit_ops(struct tty_ldisc_ops *ops) { *ops = n_tty_ops; ops->owner = NULL; } EXPORT_SYMBOL_GPL(n_tty_inherit_ops); void __init n_tty_init(void) { tty_register_ldisc(&n_tty_ops); }
41 41 41 41 41 41 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 // SPDX-License-Identifier: GPL-2.0 /* * Provide a default dump_stack() function for architectures * which don't implement their own. */ #include <linux/kernel.h> #include <linux/buildid.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/smp.h> #include <linux/atomic.h> #include <linux/kexec.h> #include <linux/utsname.h> #include <linux/stop_machine.h> static char dump_stack_arch_desc_str[128]; /** * dump_stack_set_arch_desc - set arch-specific str to show with task dumps * @fmt: printf-style format string * @...: arguments for the format string * * The configured string will be printed right after utsname during task * dumps. Usually used to add arch-specific system identifiers. If an * arch wants to make use of such an ID string, it should initialize this * as soon as possible during boot. */ void __init dump_stack_set_arch_desc(const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), fmt, args); va_end(args); } #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) #define BUILD_ID_FMT " %20phN" #define BUILD_ID_VAL vmlinux_build_id #else #define BUILD_ID_FMT "%s" #define BUILD_ID_VAL "" #endif /** * dump_stack_print_info - print generic debug info for dump_stack() * @log_lvl: log level * * Arch-specific dump_stack() implementations can use this function to * print out the same debug information as the generic dump_stack(). */ void dump_stack_print_info(const char *log_lvl) { printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s %s " BUILD_ID_FMT "\n", log_lvl, raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), current->pid, current->comm, kexec_crash_loaded() ? "Kdump: loaded " : "", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version, preempt_model_str(), BUILD_ID_VAL); if (get_taint()) printk("%s%s\n", log_lvl, print_tainted_verbose()); if (dump_stack_arch_desc_str[0] != '\0') printk("%sHardware name: %s\n", log_lvl, dump_stack_arch_desc_str); print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); print_scx_info(log_lvl, current); } /** * show_regs_print_info - print generic debug info for show_regs() * @log_lvl: log level * * show_regs() implementations can use this function to print out generic * debug information. */ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); } static void __dump_stack(const char *log_lvl) { dump_stack_print_info(log_lvl); show_stack(NULL, NULL, log_lvl); } /** * dump_stack_lvl - dump the current task information and its stack trace * @log_lvl: log level * * Architectures can override this implementation by implementing its own. */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { bool in_panic = panic_on_this_cpu(); unsigned long flags; /* * Permit this cpu to perform nested stack dumps while serialising * against other CPUs, unless this CPU is in panic. * * When in panic, non-panic CPUs are not permitted to store new * printk messages so there is no need to synchronize the output. * This avoids potential deadlock in panic() if another CPU is * holding and unable to release the printk_cpu_sync. */ if (!in_panic) printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); if (!in_panic) printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); asmlinkage __visible void dump_stack(void) { dump_stack_lvl(KERN_DEFAULT); } EXPORT_SYMBOL(dump_stack);
1 1 1 1 1 1 1 1 1 1 27 23 11 11 11 6 7 6 11 19 1 1 1 19 19 1 18 19 1 1 5 2 4 22 5 19 23 23 22 23 5 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 // SPDX-License-Identifier: GPL-2.0-or-later /* Generic associative array implementation. * * See Documentation/core-api/assoc_array.rst for information. * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ //#define DEBUG #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/assoc_array_priv.h> /* * Iterate over an associative array. The caller must hold the RCU read lock * or better. */ static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root, const struct assoc_array_ptr *stop, int (*iterator)(const void *leaf, void *iterator_data), void *iterator_data) { const struct assoc_array_shortcut *shortcut; const struct assoc_array_node *node; const struct assoc_array_ptr *cursor, *ptr, *parent; unsigned long has_meta; int slot, ret; cursor = root; begin_node: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ shortcut = assoc_array_ptr_to_shortcut(cursor); cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ } node = assoc_array_ptr_to_node(cursor); slot = 0; /* We perform two passes of each node. * * The first pass does all the leaves in this node. This means we * don't miss any leaves if the node is split up by insertion whilst * we're iterating over the branches rooted here (we may, however, see * some leaves twice). */ has_meta = 0; for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ has_meta |= (unsigned long)ptr; if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer, * which is supplied by the above READ_ONCE(). */ /* Invoke the callback */ ret = iterator(assoc_array_ptr_to_leaf(ptr), iterator_data); if (ret) return ret; } } /* The second pass attends to all the metadata pointers. If we follow * one of these we may find that we don't come back here, but rather go * back to a replacement node with the leaves in a different layout. * * We are guaranteed to make progress, however, as the slot number for * a particular portion of the key space cannot change - and we * continue at the back pointer + 1. */ if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE)) goto finished_node; slot = 0; continue_node: node = assoc_array_ptr_to_node(cursor); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (assoc_array_ptr_is_meta(ptr)) { cursor = ptr; goto begin_node; } } finished_node: /* Move up to the parent (may need to skip back over a shortcut) */ parent = READ_ONCE(node->back_pointer); /* Address dependency. */ slot = node->parent_slot; if (parent == stop) return 0; if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); cursor = parent; parent = READ_ONCE(shortcut->back_pointer); /* Address dependency. */ slot = shortcut->parent_slot; if (parent == stop) return 0; } /* Ascend to next slot in parent node */ cursor = parent; slot++; goto continue_node; } /** * assoc_array_iterate - Pass all objects in the array to a callback * @array: The array to iterate over. * @iterator: The callback function. * @iterator_data: Private data for the callback function. * * Iterate over all the objects in an associative array. Each one will be * presented to the iterator function. * * If the array is being modified concurrently with the iteration then it is * possible that some objects in the array will be passed to the iterator * callback more than once - though every object should be passed at least * once. If this is undesirable then the caller must lock against modification * for the duration of this function. * * The function will return 0 if no objects were in the array or else it will * return the result of the last iterator function called. Iteration stops * immediately if any call to the iteration function results in a non-zero * return. * * The caller should hold the RCU read lock or better if concurrent * modification is possible. */ int assoc_array_iterate(const struct assoc_array *array, int (*iterator)(const void *object, void *iterator_data), void *iterator_data) { struct assoc_array_ptr *root = READ_ONCE(array->root); /* Address dependency. */ if (!root) return 0; return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data); } enum assoc_array_walk_status { assoc_array_walk_tree_empty, assoc_array_walk_found_terminal_node, assoc_array_walk_found_wrong_shortcut, }; struct assoc_array_walk_result { struct { struct assoc_array_node *node; /* Node in which leaf might be found */ int level; int slot; } terminal_node; struct { struct assoc_array_shortcut *shortcut; int level; int sc_level; unsigned long sc_segments; unsigned long dissimilarity; } wrong_shortcut; }; /* * Navigate through the internal tree looking for the closest node to the key. */ static enum assoc_array_walk_status assoc_array_walk(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *ptr; unsigned long sc_segments, dissimilarity; unsigned long segments; int level, sc_level, next_sc_level; int slot; pr_devel("-->%s()\n", __func__); cursor = READ_ONCE(array->root); /* Address dependency. */ if (!cursor) return assoc_array_walk_tree_empty; level = 0; /* Use segments from the key for the new leaf to navigate through the * internal tree, skipping through nodes and shortcuts that are on * route to the destination. Eventually we'll come to a slot that is * either empty or contains a leaf at which point we've found a node in * which the leaf we're looking for might be found or into which it * should be inserted. */ jumped: segments = ops->get_key_chunk(index_key, level); pr_devel("segments[%d]: %lx\n", level, segments); if (assoc_array_ptr_is_shortcut(cursor)) goto follow_shortcut; consider_node: node = assoc_array_ptr_to_node(cursor); slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); slot &= ASSOC_ARRAY_FAN_MASK; ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ pr_devel("consider slot %x [ix=%d type=%lu]\n", slot, level, (unsigned long)ptr & 3); if (!assoc_array_ptr_is_meta(ptr)) { /* The node doesn't have a node/shortcut pointer in the slot * corresponding to the index key that we have to follow. */ result->terminal_node.node = node; result->terminal_node.level = level; result->terminal_node.slot = slot; pr_devel("<--%s() = terminal_node\n", __func__); return assoc_array_walk_found_terminal_node; } if (assoc_array_ptr_is_node(ptr)) { /* There is a pointer to a node in the slot corresponding to * this index key segment, so we need to follow it. */ cursor = ptr; level += ASSOC_ARRAY_LEVEL_STEP; if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) goto consider_node; goto jumped; } /* There is a shortcut in the slot corresponding to the index key * segment. We follow the shortcut if its partial index key matches * this leaf's. Otherwise we need to split the shortcut. */ cursor = ptr; follow_shortcut: shortcut = assoc_array_ptr_to_shortcut(cursor); pr_devel("shortcut to %d\n", shortcut->skip_to_level); sc_level = level + ASSOC_ARRAY_LEVEL_STEP; BUG_ON(sc_level > shortcut->skip_to_level); do { /* Check the leaf against the shortcut's index key a word at a * time, trimming the final word (the shortcut stores the index * key completely from the root to the shortcut's target). */ if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0) segments = ops->get_key_chunk(index_key, sc_level); sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT]; dissimilarity = segments ^ sc_segments; if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) { /* Trim segments that are beyond the shortcut */ int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK; dissimilarity &= ~(ULONG_MAX << shift); next_sc_level = shortcut->skip_to_level; } else { next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE; next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); } if (dissimilarity != 0) { /* This shortcut points elsewhere */ result->wrong_shortcut.shortcut = shortcut; result->wrong_shortcut.level = level; result->wrong_shortcut.sc_level = sc_level; result->wrong_shortcut.sc_segments = sc_segments; result->wrong_shortcut.dissimilarity = dissimilarity; return assoc_array_walk_found_wrong_shortcut; } sc_level = next_sc_level; } while (sc_level < shortcut->skip_to_level); /* The shortcut matches the leaf's index to this point. */ cursor = READ_ONCE(shortcut->next_node); /* Address dependency. */ if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { level = sc_level; goto jumped; } else { level = sc_level; goto consider_node; } } /** * assoc_array_find - Find an object by index key * @array: The associative array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Find an object in an associative array by walking through the internal tree * to the node that should contain the object and then searching the leaves * there. NULL is returned if the requested object was not found in the array. * * The caller must hold the RCU read lock or better. */ void *assoc_array_find(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_walk_result result; const struct assoc_array_node *node; const struct assoc_array_ptr *ptr; const void *leaf; int slot; if (assoc_array_walk(array, ops, index_key, &result) != assoc_array_walk_found_terminal_node) return NULL; node = result.terminal_node.node; /* If the target key is available to us, it's has to be pointed to by * the terminal node. */ for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); /* Address dependency. */ if (ptr && assoc_array_ptr_is_leaf(ptr)) { /* We need a barrier between the read of the pointer * and dereferencing the pointer - but only if we are * actually going to dereference it. */ leaf = assoc_array_ptr_to_leaf(ptr); if (ops->compare_object(leaf, index_key)) return (void *)leaf; } } return NULL; } /* * Destructively iterate over an associative array. The caller must prevent * other simultaneous accesses. */ static void assoc_array_destroy_subtree(struct assoc_array_ptr *root, const struct assoc_array_ops *ops) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *cursor, *parent = NULL; int slot = -1; pr_devel("-->%s()\n", __func__); cursor = root; if (!cursor) { pr_devel("empty\n"); return; } move_to_meta: if (assoc_array_ptr_is_shortcut(cursor)) { /* Descend through a shortcut */ pr_devel("[%d] shortcut\n", slot); BUG_ON(!assoc_array_ptr_is_shortcut(cursor)); shortcut = assoc_array_ptr_to_shortcut(cursor); BUG_ON(shortcut->back_pointer != parent); BUG_ON(slot != -1 && shortcut->parent_slot != slot); parent = cursor; cursor = shortcut->next_node; slot = -1; BUG_ON(!assoc_array_ptr_is_node(cursor)); } pr_devel("[%d] node\n", slot); node = assoc_array_ptr_to_node(cursor); BUG_ON(node->back_pointer != parent); BUG_ON(slot != -1 && node->parent_slot != slot); slot = 0; continue_node: pr_devel("Node %p [back=%p]\n", node, node->back_pointer); for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_ptr *ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_meta(ptr)) { parent = cursor; cursor = ptr; goto move_to_meta; } if (ops) { pr_devel("[%d] free leaf\n", slot); ops->free_object(assoc_array_ptr_to_leaf(ptr)); } } parent = node->back_pointer; slot = node->parent_slot; pr_devel("free node\n"); kfree(node); if (!parent) return; /* Done */ /* Move back up to the parent (may need to free a shortcut on * the way up) */ if (assoc_array_ptr_is_shortcut(parent)) { shortcut = assoc_array_ptr_to_shortcut(parent); BUG_ON(shortcut->next_node != cursor); cursor = parent; parent = shortcut->back_pointer; slot = shortcut->parent_slot; pr_devel("free shortcut\n"); kfree(shortcut); if (!parent) return; BUG_ON(!assoc_array_ptr_is_node(parent)); } /* Ascend to next slot in parent node */ pr_devel("ascend to %p[%d]\n", parent, slot); cursor = parent; node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; } /** * assoc_array_destroy - Destroy an associative array * @array: The array to destroy. * @ops: The operations to use. * * Discard all metadata and free all objects in an associative array. The * array will be empty and ready to use again upon completion. This function * cannot fail. * * The caller must prevent all other accesses whilst this takes place as no * attempt is made to adjust pointers gracefully to permit RCU readlock-holding * accesses to continue. On the other hand, no memory allocation is required. */ void assoc_array_destroy(struct assoc_array *array, const struct assoc_array_ops *ops) { assoc_array_destroy_subtree(array->root, ops); array->root = NULL; } /* * Handle insertion into an empty tree. */ static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit) { struct assoc_array_node *new_n0; pr_devel("-->%s()\n", __func__); new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->leaf_p = &new_n0->slots[0]; edit->adjust_count_on = new_n0; edit->set[0].ptr = &edit->array->root; edit->set[0].to = assoc_array_node_to_ptr(new_n0); pr_devel("<--%s() = ok [no root]\n", __func__); return true; } /* * Handle insertion into a terminal node. */ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, const void *index_key, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0; struct assoc_array_node *node, *new_n0, *new_n1, *side; struct assoc_array_ptr *ptr; unsigned long dissimilarity, base_seg, blank; size_t keylen; bool have_meta; int level, diff; int slot, next_slot, free_slot, i, j; node = result->terminal_node.node; level = result->terminal_node.level; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot; pr_devel("-->%s()\n", __func__); /* We arrived at a node which doesn't have an onward node or shortcut * pointer that we have to follow. This means that (a) the leaf we * want must go here (either by insertion or replacement) or (b) we * need to split this node and insert in one of the fragments. */ free_slot = -1; /* Firstly, we have to check the leaves in this node to see if there's * a matching one we should replace in place. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (!ptr) { free_slot = i; continue; } if (assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; pr_devel("<--%s() = ok [replace]\n", __func__); return true; } } /* If there is a free slot in this node then we can just insert the * leaf here. */ if (free_slot >= 0) { pr_devel("insert in free slot %d\n", free_slot); edit->leaf_p = &node->slots[free_slot]; edit->adjust_count_on = node; pr_devel("<--%s() = ok [insert]\n", __func__); return true; } /* The node has no spare slots - so we're either going to have to split * it or insert another node before it. * * Whatever, we're going to need at least two new nodes - so allocate * those now. We may also need a new shortcut, but we deal with that * when we need it. */ new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n1 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n1) return false; edit->new_meta[1] = assoc_array_node_to_ptr(new_n1); /* We need to find out how similar the leaves are. */ pr_devel("no spare slots\n"); have_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { edit->segment_cache[i] = 0xff; have_meta = true; continue; } base_seg = ops->get_object_key_chunk( assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } if (have_meta) { pr_devel("have meta\n"); goto split_node; } /* The node contains only leaves */ dissimilarity = 0; base_seg = edit->segment_cache[0]; for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++) dissimilarity |= edit->segment_cache[i] ^ base_seg; pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity); if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) { /* The old leaves all cluster in the same slot. We will need * to insert a shortcut if the new node wants to cluster with them. */ if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0) goto all_leaves_cluster_together; /* Otherwise all the old leaves cluster in the same slot, but * the new leaf wants to go into a different slot - so we * create a new node (n0) to hold the new leaf and a pointer to * a new node (n1) holding all the old leaves. * * This can be done by falling through to the node splitting * path. */ pr_devel("present leaves cluster but not new leaf\n"); } split_node: pr_devel("split node\n"); /* We need to split the current node. The node must contain anything * from a single leaf (in the one leaf case, this leaf will cluster * with the new leaf) and the rest meta-pointers, to all leaves, some * of which may cluster. * * It won't contain the case in which all the current leaves plus the * new leaves want to cluster in the same slot. * * We need to expel at least two leaves out of a set consisting of the * leaves in the node and the new leaf. The current meta pointers can * just be copied as they shouldn't cluster with any of the leaves. * * We need a new node (n0) to replace the current one and a new node to * take the expelled nodes (n1). */ edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ do_split_node: pr_devel("do_split_node\n"); new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; new_n1->nr_leaves_on_branch = 0; /* Begin by finding two matching leaves. There have to be at least two * that match - even if there are meta pointers - because any leaf that * would match a slot with a meta pointer in it must be somewhere * behind that meta pointer and cannot be here. Further, given N * remaining leaf slots, we now have N+1 leaves to go in them. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { slot = edit->segment_cache[i]; if (slot != 0xff) for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++) if (edit->segment_cache[j] == slot) goto found_slot_for_multiple_occupancy; } found_slot_for_multiple_occupancy: pr_devel("same slot: %x %x [%02x]\n", i, j, slot); BUG_ON(i >= ASSOC_ARRAY_FAN_OUT); BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1); BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT); new_n1->parent_slot = slot; /* Metadata pointers cannot change slot */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) if (assoc_array_ptr_is_meta(node->slots[i])) new_n0->slots[i] = node->slots[i]; else new_n0->slots[i] = NULL; BUG_ON(new_n0->slots[slot] != NULL); new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1); /* Filter the leaf pointers between the new nodes */ free_slot = -1; next_slot = 0; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (assoc_array_ptr_is_meta(node->slots[i])) continue; if (edit->segment_cache[i] == slot) { new_n1->slots[next_slot++] = node->slots[i]; new_n1->nr_leaves_on_branch++; } else { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); new_n0->slots[free_slot] = node->slots[i]; } } pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot); if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) { do { free_slot++; } while (new_n0->slots[free_slot] != NULL); edit->leaf_p = &new_n0->slots[free_slot]; edit->adjust_count_on = new_n0; } else { edit->leaf_p = &new_n1->slots[next_slot++]; edit->adjust_count_on = new_n1; } BUG_ON(next_slot <= 1); edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0); for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { if (edit->segment_cache[i] == 0xff) { ptr = node->slots[i]; BUG_ON(assoc_array_ptr_is_leaf(ptr)); if (assoc_array_ptr_is_node(ptr)) { side = assoc_array_ptr_to_node(ptr); edit->set_backpointers[i] = &side->back_pointer; } else { shortcut = assoc_array_ptr_to_shortcut(ptr); edit->set_backpointers[i] = &shortcut->back_pointer; } } } ptr = node->back_pointer; if (!ptr) edit->set[0].ptr = &edit->array->root; else if (assoc_array_ptr_is_node(ptr)) edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot]; else edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node; edit->excised_meta[0] = assoc_array_node_to_ptr(node); pr_devel("<--%s() = ok [split node]\n", __func__); return true; all_leaves_cluster_together: /* All the leaves, new and old, want to cluster together in this node * in the same slot, so we have to replace this node with a shortcut to * skip over the identical parts of the key and then place a pair of * nodes, one inside the other, at the end of the shortcut and * distribute the keys between them. * * Firstly we need to work out where the leaves start diverging as a * bit position into their keys so that we know how big the shortcut * needs to be. * * We only need to make a single pass of N of the N+1 leaves because if * any keys differ between themselves at bit X then at least one of * them must also differ with the base key at bit X or before. */ pr_devel("all leaves cluster together\n"); diff = INT_MAX; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { int x = ops->diff_objects(assoc_array_ptr_to_leaf(node->slots[i]), index_key); if (x < diff) { BUG_ON(x < 0); diff = x; } } BUG_ON(diff == INT_MAX); BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc(struct_size(new_s0, index_key, keylen), GFP_KERNEL); if (!new_s0) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = node->back_pointer; new_s0->parent_slot = node->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); new_n1->parent_slot = -1; /* Need to calculate this */ new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK; pr_devel("skip_to_level = %d [diff %d]\n", level, diff); BUG_ON(level <= 0); for (i = 0; i < keylen; i++) new_s0->index_key[i] = ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); if (level & ASSOC_ARRAY_KEY_CHUNK_MASK) { blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); new_s0->index_key[keylen - 1] &= ~blank; } /* This now reduces to a node splitting exercise for which we'll need * to regenerate the disparity table. */ for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr), level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; } base_seg = ops->get_key_chunk(index_key, level); base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK; goto do_split_node; } /* * Handle insertion into the middle of a shortcut. */ static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit, const struct assoc_array_ops *ops, struct assoc_array_walk_result *result) { struct assoc_array_shortcut *shortcut, *new_s0, *new_s1; struct assoc_array_node *node, *new_n0, *side; unsigned long sc_segments, dissimilarity, blank; size_t keylen; int level, sc_level, diff; int sc_slot; shortcut = result->wrong_shortcut.shortcut; level = result->wrong_shortcut.level; sc_level = result->wrong_shortcut.sc_level; sc_segments = result->wrong_shortcut.sc_segments; dissimilarity = result->wrong_shortcut.dissimilarity; pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n", __func__, level, dissimilarity, sc_level); /* We need to split a shortcut and insert a node between the two * pieces. Zero-length pieces will be dispensed with entirely. * * First of all, we need to find out in which level the first * difference was. */ diff = __ffs(dissimilarity); diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK; diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK; pr_devel("diff=%d\n", diff); if (!shortcut->back_pointer) { edit->set[0].ptr = &edit->array->root; } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) { node = assoc_array_ptr_to_node(shortcut->back_pointer); edit->set[0].ptr = &node->slots[shortcut->parent_slot]; } else { BUG(); } edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut); /* Create a new node now since we're going to need it anyway */ new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) return false; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); edit->adjust_count_on = new_n0; /* Insert a new shortcut before the new node if this segment isn't of * zero length - otherwise we just connect the new node directly to the * parent. */ level += ASSOC_ARRAY_LEVEL_STEP; if (diff > level) { pr_devel("pre-shortcut %d...%d\n", level, diff); keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s0 = kzalloc(struct_size(new_s0, index_key, keylen), GFP_KERNEL); if (!new_s0) return false; edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0); edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); new_s0->back_pointer = shortcut->back_pointer; new_s0->parent_slot = shortcut->parent_slot; new_s0->next_node = assoc_array_node_to_ptr(new_n0); new_s0->skip_to_level = diff; new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); new_n0->parent_slot = 0; memcpy(new_s0->index_key, shortcut->index_key, flex_array_size(new_s0, index_key, keylen)); blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank); new_s0->index_key[keylen - 1] &= ~blank; } else { pr_devel("no pre-shortcut\n"); edit->set[0].to = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = shortcut->back_pointer; new_n0->parent_slot = shortcut->parent_slot; } side = assoc_array_ptr_to_node(shortcut->next_node); new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch; /* We need to know which slot in the new node is going to take a * metadata pointer. */ sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); sc_slot &= ASSOC_ARRAY_FAN_MASK; pr_devel("new slot %lx >> %d -> %d\n", sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot); /* Determine whether we need to follow the new node with a replacement * for the current shortcut. We could in theory reuse the current * shortcut if its parent slot number doesn't change - but that's a * 1-in-16 chance so not worth expending the code upon. */ level = diff + ASSOC_ARRAY_LEVEL_STEP; if (level < shortcut->skip_to_level) { pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s1 = kzalloc(struct_size(new_s1, index_key, keylen), GFP_KERNEL); if (!new_s1) return false; edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1); new_s1->back_pointer = assoc_array_node_to_ptr(new_n0); new_s1->parent_slot = sc_slot; new_s1->next_node = shortcut->next_node; new_s1->skip_to_level = shortcut->skip_to_level; new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1); memcpy(new_s1->index_key, shortcut->index_key, flex_array_size(new_s1, index_key, keylen)); edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1); } else { pr_devel("no post-shortcut\n"); /* We don't have to replace the pointed-to node as long as we * use memory barriers to make sure the parent slot number is * changed before the back pointer (the parent slot number is * irrelevant to the old parent shortcut). */ new_n0->slots[sc_slot] = shortcut->next_node; edit->set_parent_slot[0].p = &side->parent_slot; edit->set_parent_slot[0].to = sc_slot; edit->set[1].ptr = &side->back_pointer; edit->set[1].to = assoc_array_node_to_ptr(new_n0); } /* Install the new leaf in a spare slot in the new node. */ if (sc_slot == 0) edit->leaf_p = &new_n0->slots[1]; else edit->leaf_p = &new_n0->slots[0]; pr_devel("<--%s() = ok [split shortcut]\n", __func__); return true; } /** * assoc_array_insert - Script insertion of an object into an associative array * @array: The array to insert into. * @ops: The operations to use. * @index_key: The key to insert at. * @object: The object to insert. * * Precalculate and preallocate a script for the insertion or replacement of an * object in an associative array. This results in an edit script that can * either be applied or cancelled. * * The function returns a pointer to an edit script or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, void *object) { struct assoc_array_walk_result result; struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); /* The leaf pointer we're given must not have the bottom bit set as we * use those for type-marking the pointer. NULL pointers are also not * allowed as they indicate an empty slot but we have to allow them * here as they can be updated later. */ BUG_ON(assoc_array_ptr_is_meta(object)); edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->leaf = assoc_array_leaf_to_ptr(object); edit->adjust_count_by = 1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_tree_empty: /* Allocate a root node if there isn't one yet */ if (!assoc_array_insert_in_empty_tree(edit)) goto enomem; return edit; case assoc_array_walk_found_terminal_node: /* We found a node that doesn't have a node/shortcut pointer in * the slot corresponding to the index key that we have to * follow. */ if (!assoc_array_insert_into_terminal_node(edit, ops, index_key, &result)) goto enomem; return edit; case assoc_array_walk_found_wrong_shortcut: /* We found a shortcut that didn't match our key in a slot we * needed to follow. */ if (!assoc_array_insert_mid_shortcut(edit, ops, &result)) goto enomem; return edit; } enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_insert_set_object - Set the new object pointer in an edit script * @edit: The edit script to modify. * @object: The object pointer to set. * * Change the object to be inserted in an edit script. The object pointed to * by the old object is not freed. This must be done prior to applying the * script. */ void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object) { BUG_ON(!object); edit->leaf = assoc_array_leaf_to_ptr(object); } struct assoc_array_delete_collapse_context { struct assoc_array_node *node; const void *skip_leaf; int slot; }; /* * Subtree collapse to node iterator. */ static int assoc_array_delete_collapse_iterator(const void *leaf, void *iterator_data) { struct assoc_array_delete_collapse_context *collapse = iterator_data; if (leaf == collapse->skip_leaf) return 0; BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT); collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf); return 0; } /** * assoc_array_delete - Script deletion of an object from an associative array * @array: The array to search. * @ops: The operations to use. * @index_key: The key to the object. * * Precalculate and preallocate a script for the deletion of an object from an * associative array. This results in an edit script that can either be * applied or cancelled. * * The function returns a pointer to an edit script if the object was found, * NULL if the object was not found or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key) { struct assoc_array_delete_collapse_context collapse; struct assoc_array_walk_result result; struct assoc_array_node *node, *new_n0; struct assoc_array_edit *edit; struct assoc_array_ptr *ptr; bool has_meta; int slot, i; pr_devel("-->%s()\n", __func__); edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->adjust_count_by = -1; switch (assoc_array_walk(array, ops, index_key, &result)) { case assoc_array_walk_found_terminal_node: /* We found a node that should contain the leaf we've been * asked to remove - *if* it's in the tree. */ pr_devel("terminal_node\n"); node = result.terminal_node.node; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (ptr && assoc_array_ptr_is_leaf(ptr) && ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) goto found_leaf; } fallthrough; case assoc_array_walk_tree_empty: case assoc_array_walk_found_wrong_shortcut: default: assoc_array_cancel_edit(edit); pr_devel("not found\n"); return NULL; } found_leaf: BUG_ON(array->nr_leaves_on_tree <= 0); /* In the simplest form of deletion we just clear the slot and release * the leaf after a suitable interval. */ edit->dead_leaf = node->slots[slot]; edit->set[0].ptr = &node->slots[slot]; edit->set[0].to = NULL; edit->adjust_count_on = node; /* If that concludes erasure of the last leaf, then delete the entire * internal array. */ if (array->nr_leaves_on_tree == 1) { edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->adjust_count_on = NULL; edit->excised_subtree = array->root; pr_devel("all gone\n"); return edit; } /* However, we'd also like to clear up some metadata blocks if we * possibly can. * * We go for a simple algorithm of: if this node has FAN_OUT or fewer * leaves in it, then attempt to collapse it - and attempt to * recursively collapse up the tree. * * We could also try and collapse in partially filled subtrees to take * up space in this node. */ if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { struct assoc_array_node *parent, *grandparent; struct assoc_array_ptr *ptr; /* First of all, we need to know if this node has metadata so * that we don't try collapsing if all the leaves are already * here. */ has_meta = false; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { ptr = node->slots[i]; if (assoc_array_ptr_is_meta(ptr)) { has_meta = true; break; } } pr_devel("leaves: %ld [m=%d]\n", node->nr_leaves_on_branch - 1, has_meta); /* Look further up the tree to see if we can collapse this node * into a more proximal node too. */ parent = node; collapse_up: pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch); ptr = parent->back_pointer; if (!ptr) goto do_collapse; if (assoc_array_ptr_is_shortcut(ptr)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr); ptr = s->back_pointer; if (!ptr) goto do_collapse; } grandparent = assoc_array_ptr_to_node(ptr); if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { parent = grandparent; goto collapse_up; } do_collapse: /* There's no point collapsing if the original node has no meta * pointers to discard and if we didn't merge into one of that * node's ancestry. */ if (has_meta || parent != node) { node = parent; /* Create a new node to collapse into */ new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n0) goto enomem; edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); new_n0->back_pointer = node->back_pointer; new_n0->parent_slot = node->parent_slot; new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; edit->adjust_count_on = new_n0; collapse.node = new_n0; collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf); collapse.slot = 0; assoc_array_subtree_iterate(assoc_array_node_to_ptr(node), node->back_pointer, assoc_array_delete_collapse_iterator, &collapse); pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch); BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1); if (!node->back_pointer) { edit->set[1].ptr = &array->root; } else if (assoc_array_ptr_is_leaf(node->back_pointer)) { BUG(); } else if (assoc_array_ptr_is_node(node->back_pointer)) { struct assoc_array_node *p = assoc_array_ptr_to_node(node->back_pointer); edit->set[1].ptr = &p->slots[node->parent_slot]; } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(node->back_pointer); edit->set[1].ptr = &s->next_node; } edit->set[1].to = assoc_array_node_to_ptr(new_n0); edit->excised_subtree = assoc_array_node_to_ptr(node); } } return edit; enomem: /* Clean up after an out of memory error */ pr_devel("enomem\n"); assoc_array_cancel_edit(edit); return ERR_PTR(-ENOMEM); } /** * assoc_array_clear - Script deletion of all objects from an associative array * @array: The array to clear. * @ops: The operations to use. * * Precalculate and preallocate a script for the deletion of all the objects * from an associative array. This results in an edit script that can either * be applied or cancelled. * * The function returns a pointer to an edit script if there are objects to be * deleted, NULL if there are no objects in the array or -ENOMEM. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, const struct assoc_array_ops *ops) { struct assoc_array_edit *edit; pr_devel("-->%s()\n", __func__); if (!array->root) return NULL; edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return ERR_PTR(-ENOMEM); edit->array = array; edit->ops = ops; edit->set[1].ptr = &array->root; edit->set[1].to = NULL; edit->excised_subtree = array->root; edit->ops_for_excised_subtree = ops; pr_devel("all gone\n"); return edit; } /* * Handle the deferred destruction after an applied edit. */ static void assoc_array_rcu_cleanup(struct rcu_head *head) { struct assoc_array_edit *edit = container_of(head, struct assoc_array_edit, rcu); int i; pr_devel("-->%s()\n", __func__); if (edit->dead_leaf) edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf)); for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) if (edit->excised_meta[i]) kfree(assoc_array_ptr_to_node(edit->excised_meta[i])); if (edit->excised_subtree) { BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree)); if (assoc_array_ptr_is_node(edit->excised_subtree)) { struct assoc_array_node *n = assoc_array_ptr_to_node(edit->excised_subtree); n->back_pointer = NULL; } else { struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(edit->excised_subtree); s->back_pointer = NULL; } assoc_array_destroy_subtree(edit->excised_subtree, edit->ops_for_excised_subtree); } kfree(edit); } /** * assoc_array_apply_edit - Apply an edit script to an associative array * @edit: The script to apply. * * Apply an edit script to an associative array to effect an insertion, * deletion or clearance. As the edit script includes preallocated memory, * this is guaranteed not to fail. * * The edit script, dead objects and dead metadata will be scheduled for * destruction after an RCU grace period to permit those doing read-only * accesses on the array to continue to do so under the RCU read lock whilst * the edit is taking place. */ void assoc_array_apply_edit(struct assoc_array_edit *edit) { struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); smp_wmb(); if (edit->leaf_p) *edit->leaf_p = edit->leaf; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++) if (edit->set_parent_slot[i].p) *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++) if (edit->set_backpointers[i]) *edit->set_backpointers[i] = edit->set_backpointers_to; smp_wmb(); for (i = 0; i < ARRAY_SIZE(edit->set); i++) if (edit->set[i].ptr) *edit->set[i].ptr = edit->set[i].to; if (edit->array->root == NULL) { edit->array->nr_leaves_on_tree = 0; } else if (edit->adjust_count_on) { node = edit->adjust_count_on; for (;;) { node->nr_leaves_on_branch += edit->adjust_count_by; ptr = node->back_pointer; if (!ptr) break; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = shortcut->back_pointer; if (!ptr) break; } BUG_ON(!assoc_array_ptr_is_node(ptr)); node = assoc_array_ptr_to_node(ptr); } edit->array->nr_leaves_on_tree += edit->adjust_count_by; } call_rcu(&edit->rcu, assoc_array_rcu_cleanup); } /** * assoc_array_cancel_edit - Discard an edit script. * @edit: The script to discard. * * Free an edit script and all the preallocated data it holds without making * any changes to the associative array it was intended for. * * NOTE! In the case of an insertion script, this does _not_ release the leaf * that was to be inserted. That is left to the caller. */ void assoc_array_cancel_edit(struct assoc_array_edit *edit) { struct assoc_array_ptr *ptr; int i; pr_devel("-->%s()\n", __func__); /* Clean up after an out of memory error */ for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) { ptr = edit->new_meta[i]; if (ptr) { if (assoc_array_ptr_is_node(ptr)) kfree(assoc_array_ptr_to_node(ptr)); else kfree(assoc_array_ptr_to_shortcut(ptr)); } } kfree(edit); } /** * assoc_array_gc - Garbage collect an associative array. * @array: The array to clean. * @ops: The operations to use. * @iterator: A callback function to pass judgement on each object. * @iterator_data: Private data for the callback function. * * Collect garbage from an associative array and pack down the internal tree to * save memory. * * The iterator function is asked to pass judgement upon each object in the * array. If it returns false, the object is discard and if it returns true, * the object is kept. If it returns true, it must increment the object's * usage count (or whatever it needs to do to retain it) before returning. * * This function returns 0 if successful or -ENOMEM if out of memory. In the * latter case, the array is not changed. * * The caller should lock against other modifications and must continue to hold * the lock until assoc_array_apply_edit() has been called. * * Accesses to the tree may take place concurrently with this function, * provided they hold the RCU read lock. */ int assoc_array_gc(struct assoc_array *array, const struct assoc_array_ops *ops, bool (*iterator)(void *object, void *iterator_data), void *iterator_data) { struct assoc_array_shortcut *shortcut, *new_s; struct assoc_array_node *node, *new_n; struct assoc_array_edit *edit; struct assoc_array_ptr *cursor, *ptr; struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; unsigned long nr_leaves_on_tree; bool retained; int keylen, slot, nr_free, next_slot, i; pr_devel("-->%s()\n", __func__); if (!array->root) return 0; edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); if (!edit) return -ENOMEM; edit->array = array; edit->ops = ops; edit->ops_for_excised_subtree = ops; edit->set[0].ptr = &array->root; edit->excised_subtree = array->root; new_root = new_parent = NULL; new_ptr_pp = &new_root; cursor = array->root; descend: /* If this point is a shortcut, then we need to duplicate it and * advance the target cursor. */ if (assoc_array_ptr_is_shortcut(cursor)) { shortcut = assoc_array_ptr_to_shortcut(cursor); keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; new_s = kmalloc(struct_size(new_s, index_key, keylen), GFP_KERNEL); if (!new_s) goto enomem; pr_devel("dup shortcut %p -> %p\n", shortcut, new_s); memcpy(new_s, shortcut, struct_size(new_s, index_key, keylen)); new_s->back_pointer = new_parent; new_s->parent_slot = shortcut->parent_slot; *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s); new_ptr_pp = &new_s->next_node; cursor = shortcut->next_node; } /* Duplicate the node at this position */ node = assoc_array_ptr_to_node(cursor); new_n = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); if (!new_n) goto enomem; pr_devel("dup node %p -> %p\n", node, new_n); new_n->back_pointer = new_parent; new_n->parent_slot = node->parent_slot; *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n); new_ptr_pp = NULL; slot = 0; continue_node: /* Filter across any leaves and gc any subtrees */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = node->slots[slot]; if (!ptr) continue; if (assoc_array_ptr_is_leaf(ptr)) { if (iterator(assoc_array_ptr_to_leaf(ptr), iterator_data)) /* The iterator will have done any reference * counting on the object for us. */ new_n->slots[slot] = ptr; continue; } new_ptr_pp = &new_n->slots[slot]; cursor = ptr; goto descend; } retry_compress: pr_devel("-- compress node %p --\n", new_n); /* Count up the number of empty slots in this node and work out the * subtree leaf count. */ new_n->nr_leaves_on_branch = 0; nr_free = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = new_n->slots[slot]; if (!ptr) nr_free++; else if (assoc_array_ptr_is_leaf(ptr)) new_n->nr_leaves_on_branch++; } pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); /* See what we can fold in */ retained = false; next_slot = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_shortcut *s; struct assoc_array_node *child; ptr = new_n->slots[slot]; if (!ptr || assoc_array_ptr_is_leaf(ptr)) continue; s = NULL; if (assoc_array_ptr_is_shortcut(ptr)) { s = assoc_array_ptr_to_shortcut(ptr); ptr = s->next_node; } child = assoc_array_ptr_to_node(ptr); new_n->nr_leaves_on_branch += child->nr_leaves_on_branch; if (child->nr_leaves_on_branch <= nr_free + 1) { /* Fold the child node into this one */ pr_devel("[%d] fold node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); /* We would already have reaped an intervening shortcut * on the way back up the tree. */ BUG_ON(s); new_n->slots[slot] = NULL; nr_free++; if (slot < next_slot) next_slot = slot; for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { struct assoc_array_ptr *p = child->slots[i]; if (!p) continue; BUG_ON(assoc_array_ptr_is_meta(p)); while (new_n->slots[next_slot]) next_slot++; BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT); new_n->slots[next_slot++] = p; nr_free--; } kfree(child); } else { pr_devel("[%d] retain node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); retained = true; } } if (retained && new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { pr_devel("internal nodes remain despite enough space, retrying\n"); goto retry_compress; } pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); nr_leaves_on_tree = new_n->nr_leaves_on_branch; /* Excise this node if it is singly occupied by a shortcut */ if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) { for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) if ((ptr = new_n->slots[slot])) break; if (assoc_array_ptr_is_meta(ptr) && assoc_array_ptr_is_shortcut(ptr)) { pr_devel("excise node %p with 1 shortcut\n", new_n); new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_n->back_pointer; slot = new_n->parent_slot; kfree(new_n); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } if (assoc_array_ptr_is_shortcut(new_parent)) { /* We can discard any preceding shortcut also */ struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(new_parent); pr_devel("excise preceding shortcut\n"); new_parent = new_s->back_pointer = s->back_pointer; slot = new_s->parent_slot = s->parent_slot; kfree(s); if (!new_parent) { new_s->back_pointer = NULL; new_s->parent_slot = 0; new_root = ptr; goto gc_complete; } } new_s->back_pointer = new_parent; new_s->parent_slot = slot; new_n = assoc_array_ptr_to_node(new_parent); new_n->slots[slot] = ptr; goto ascend_old_tree; } } /* Excise any shortcuts we might encounter that point to nodes that * only contain leaves. */ ptr = new_n->back_pointer; if (!ptr) goto gc_complete; if (assoc_array_ptr_is_shortcut(ptr)) { new_s = assoc_array_ptr_to_shortcut(ptr); new_parent = new_s->back_pointer; slot = new_s->parent_slot; if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { struct assoc_array_node *n; pr_devel("excise shortcut\n"); new_n->back_pointer = new_parent; new_n->parent_slot = slot; kfree(new_s); if (!new_parent) { new_root = assoc_array_node_to_ptr(new_n); goto gc_complete; } n = assoc_array_ptr_to_node(new_parent); n->slots[slot] = assoc_array_node_to_ptr(new_n); } } else { new_parent = ptr; } new_n = assoc_array_ptr_to_node(new_parent); ascend_old_tree: ptr = node->back_pointer; if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); slot = shortcut->parent_slot; cursor = shortcut->back_pointer; if (!cursor) goto gc_complete; } else { slot = node->parent_slot; cursor = ptr; } BUG_ON(!cursor); node = assoc_array_ptr_to_node(cursor); slot++; goto continue_node; gc_complete: edit->set[0].to = new_root; assoc_array_apply_edit(edit); array->nr_leaves_on_tree = nr_leaves_on_tree; return 0; enomem: pr_devel("enomem\n"); assoc_array_destroy_subtree(new_root, edit->ops); kfree(edit); return -ENOMEM; }
3617 2371 132 564 5328 5485 132 13 1125 32 5491 5484 5400 5400 332 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM block #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H #include <linux/blktrace_api.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/tracepoint.h> #include <uapi/linux/ioprio.h> #define RWBS_LEN 10 #define IOPRIO_CLASS_STRINGS \ { IOPRIO_CLASS_NONE, "none" }, \ { IOPRIO_CLASS_RT, "rt" }, \ { IOPRIO_CLASS_BE, "be" }, \ { IOPRIO_CLASS_IDLE, "idle" }, \ { IOPRIO_CLASS_INVALID, "invalid"} #ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh), TP_STRUCT__entry ( __field( dev_t, dev ) __field( sector_t, sector ) __field( size_t, size ) ), TP_fast_assign( __entry->dev = bh->b_bdev->bd_dev; __entry->sector = bh->b_blocknr; __entry->size = bh->b_size; ), TP_printk("%d,%d sector=%llu size=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->sector, __entry->size ) ); /** * block_touch_buffer - mark a buffer accessed * @bh: buffer_head being touched * * Called from touch_buffer(). */ DEFINE_EVENT(block_buffer, block_touch_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_dirty_buffer - mark a buffer dirty * @bh: buffer_head being dirtied * * Called from mark_buffer_dirty(). */ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); #endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue * @rq: block IO operation request * * The block operation request @rq is being placed back into queue * @q. For some reason the request was not completed and needs to be * put back in the queue. */ TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), 0) ); DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int , error ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->error) ); /** * block_rq_complete - block IO operation completed by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If * the @rq->bio is %NULL, then there is absolutely no additional work to * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request. */ DEFINE_EVENT(block_rq_completion, block_rq_complete, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); /** * block_rq_error - block IO operation error reported by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_error tracepoint event indicates that some portion * of operation request has failed as reported by the device driver. */ DEFINE_EVENT(block_rq_completion, block_rq_error, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); __entry->ioprio = req_get_ioprio(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %u (%s) %llu + %u %s,%u,%u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm) ); /** * block_rq_insert - insert block operation request into queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted * into queue @q. The fields in the operation request @rq struct can * be examined to determine which device and sectors the pending * operation would access. */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. */ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. */ DEFINE_EVENT(block_rq, block_rq_merge, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_start - insert a request for execution * @rq: block IO operation request * * Called when block operation request @rq is queued for execution */ DEFINE_EVENT(block_rq, block_io_start, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_done - block IO operation request completed * @rq: block IO operation request * * Called when block operation request @rq is completed */ DEFINE_EVENT(block_rq, block_io_done, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. */ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), TP_ARGS(q, bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_bio_backmerge - merging block operation to the end of an existing operation * @bio: new block operation to merge * * Merging block request @bio to the end of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation * @bio: new block operation to merge * * Merging block IO operation @bio to the beginning of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations * @bio: pending block IO operation (can be %NULL) * * A request struct has been allocated to handle the block IO operation @bio. */ DEFINE_EVENT(block_bio, block_getrq, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * blk_zone_append_update_request_bio - update bio sector after zone append * @rq: the completed request that sets the bio sector * * Update the bio's bi_sector after a zone append command has been completed. */ DEFINE_EVENT(block_rq, blk_zone_append_update_request_bio, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_plug - keep operations requests in request queue * @q: request queue to plug * * Plug the request queue @q. Do not allow block operation requests * to be sent to the device driver. Instead, accumulate requests in * the queue to improve throughput performance of the block device. */ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), TP_ARGS(q), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s]", __entry->comm) ); DECLARE_EVENT_CLASS(block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->nr_rq = depth; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); /** * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ DEFINE_EVENT(block_unplug, block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit) ); /** * block_split - split a single bio struct into two bio structs * @bio: block operation being split * @new_sector: The starting sector for the new bio * * The bio request @bio needs to be split into two bio requests. The newly * created @bio request starts at @new_sector. This split may be required due to * hardware limitations such as operation crossing device boundaries in a RAID * system. */ TRACE_EVENT(block_split, TP_PROTO(struct bio *bio, unsigned int new_sector), TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, new_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, (unsigned long long)__entry->new_sector, __entry->comm) ); /** * block_bio_remap - map request for a logical device to the raw device * @bio: revised operation * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the * raw block device. */ TRACE_EVENT(block_bio_remap, TP_PROTO(struct bio *bio, dev_t dev, sector_t from), TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector) ); /** * block_rq_remap - map request for a block operation request * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation * * The block operation request @rq in @q has been remapped. The block * operation request @rq holds the current information and @from hold * the original sector. */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request *rq, dev_t dev, sector_t from), TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector, __entry->nr_bios) ); /** * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bio: The block IO operation sent down to the device * @nr_sectors: The number of sectors affected by this operation * * Execute a zone management operation on a specified range of zones. This * range is encoded in %nr_sectors, which has to be a multiple of the zone * size. */ TRACE_EVENT(blkdev_zone_mgmt, TP_PROTO(struct bio *bio, sector_t nr_sectors), TP_ARGS(bio, nr_sectors), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, nr_sectors ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sectors = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sectors) ); DECLARE_EVENT_CLASS(block_zwplug, TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, unsigned int nr_sectors), TP_ARGS(q, zno, sector, nr_sectors), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned int, zno ) __field( sector_t, sector ) __field( unsigned int, nr_sectors ) ), TP_fast_assign( __entry->dev = disk_devt(q->disk); __entry->zno = zno; __entry->sector = sector; __entry->nr_sectors = nr_sectors; ), TP_printk("%d,%d zone %u, BIO %llu + %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->zno, (unsigned long long)__entry->sector, __entry->nr_sectors) ); DEFINE_EVENT(block_zwplug, disk_zone_wplug_add_bio, TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, unsigned int nr_sectors), TP_ARGS(q, zno, sector, nr_sectors) ); DEFINE_EVENT(block_zwplug, blk_zone_wplug_bio, TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, unsigned int nr_sectors), TP_ARGS(q, zno, sector, nr_sectors) ); #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1356 1355 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> #include <linux/of_iommu.h> #include <linux/of_reserved_mem.h> #include <linux/dma-direct.h> /* for bus_dma_region */ #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <asm/errno.h> #include "of_private.h" /** * of_match_device - Tell if a struct device matches an of_device_id list * @matches: array of of_device_id match structures to search in * @dev: the OF device structure to match against * * Used by a driver to check whether an platform_device present in the * system is in its list of supported devices. */ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } EXPORT_SYMBOL(of_match_device); static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *of_node = dev->of_node; struct of_phandle_iterator it; int rc, i = 0; if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) return; /* * If dev->of_node doesn't exist or doesn't contain memory-region, try * the OF node having DMA configuration. */ if (!of_property_present(of_node, "memory-region")) of_node = np; of_for_each_phandle(&it, rc, of_node, "memory-region", NULL, 0) { /* * There might be multiple memory regions, but only one * restricted-dma-pool region is allowed. */ if (of_device_is_compatible(it.node, "restricted-dma-pool") && of_device_is_available(it.node)) { if (of_reserved_mem_device_init_by_idx(dev, of_node, i)) dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); of_node_put(it.node); break; } i++; } } /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. * * If platform code needs to use its own special DMA configuration, it * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 mask, end = 0; bool coherent, set_map = false; int ret; if (dev->dma_range_map) { dev_dbg(dev, "dma_range_map already set\n"); goto skip_map; } if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else bus_np = of_node_get(np); ret = of_dma_get_range(bus_np, &map); of_node_put(bus_np); if (ret < 0) { /* * For legacy reasons, we have to assume some devices need * DMA configuration regardless of whether "dma-ranges" is * correctly specified or not. */ if (!force_dma) return ret == -ENODEV ? 0 : ret; } else { /* Determine the overall bounds of all DMA regions */ end = dma_range_map_max(map); set_map = true; } skip_map: /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For * now, we'll continue the legacy behaviour of coercing it to the * coherent mask if not, but we'll no longer do so quietly. */ if (!dev->dma_mask) { dev_warn(dev, "DMA mask not set\n"); dev->dma_mask = &dev->coherent_dma_mask; } if (!end && dev->coherent_dma_mask) end = dev->coherent_dma_mask; else if (!end) end = (1ULL << 32) - 1; /* * Limit coherent and dma mask based on size and default mask * set by the driver. */ mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; /* ...but only set bus limit and range map if we found valid dma-ranges earlier */ if (set_map) { dev->bus_dma_limit = end; dev->dma_range_map = map; } coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); ret = of_iommu_configure(dev, np, id); if (ret == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (set_map) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; } /* Take all other IOMMU errors to mean we'll just carry on without it */ dev_dbg(dev, "device is%sbehind an iommu\n", !ret ? " " : " not "); arch_setup_dma_ops(dev, coherent); if (ret) of_dma_set_restricted_buffer(dev, np); return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); const void *of_device_get_match_data(const struct device *dev) { const struct of_device_id *match; match = of_match_device(dev->driver->of_match_table, dev); if (!match) return NULL; return match->data; } EXPORT_SYMBOL(of_device_get_match_data); /** * of_device_modalias - Fill buffer with newline terminated modalias string * @dev: Calling device * @str: Modalias string * @len: Size of @str */ ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len) { ssize_t sl; if (!dev || !dev->of_node || dev->of_node_reused) return -ENODEV; sl = of_modalias(dev->of_node, str, len - 2); if (sl < 0) return sl; if (sl > len - 2) return -ENOMEM; str[sl++] = '\n'; str[sl] = 0; return sl; } EXPORT_SYMBOL_GPL(of_device_modalias); /** * of_device_uevent - Display OF related uevent information * @dev: Device to display the uevent information for * @env: Kernel object's userspace event reference to fill up */ void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const char *compat, *type; struct alias_prop *app; struct property *p; int seen = 0; if ((!dev) || (!dev->of_node)) return; add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node); add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node); type = of_node_get_device_type(dev->of_node); if (type) add_uevent_var(env, "OF_TYPE=%s", type); /* Since the compatible field can contain pretty much anything * it's not really legal to split it out with commas. We split it * up using a number of environment variables instead. */ of_property_for_each_string(dev->of_node, "compatible", p, compat) { add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat); seen++; } add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen); seen = 0; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (dev->of_node == app->np) { add_uevent_var(env, "OF_ALIAS_%d=%s", seen, app->alias); seen++; } } mutex_unlock(&of_mutex); } EXPORT_SYMBOL_GPL(of_device_uevent); int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { int sl; if ((!dev) || (!dev->of_node) || dev->of_node_reused) return -ENODEV; /* Devicetree modalias is tricky, we add it in 2 steps */ if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; sl = of_modalias(dev->of_node, &env->buf[env->buflen-1], sizeof(env->buf) - env->buflen); if (sl < 0) return sl; if (sl >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += sl; return 0; } EXPORT_SYMBOL_GPL(of_device_uevent_modalias); /** * of_device_make_bus_id - Use the device node data to assign a unique name * @dev: pointer to device structure that is linked to a device tree node * * This routine will first try using the translated bus address to * derive a unique name. If it cannot, then it will prepend names from * parent nodes until a unique name can be derived. */ void of_device_make_bus_id(struct device *dev) { struct device_node *node = dev->of_node; const __be32 *reg; u64 addr; u32 mask; /* Construct the name, using parent nodes if necessary to ensure uniqueness */ while (node->parent) { /* * If the address can be translated, then that is as much * uniqueness as we need. Make it the first component and return */ reg = of_get_property(node, "reg", NULL); if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) { if (!of_property_read_u32(node, "mask", &mask)) dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn", addr, ffs(mask) - 1, node, dev_name(dev)); else dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn", addr, node, dev_name(dev)); return; } /* format arguments only used if dev_name() resolves to NULL */ dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s", kbasename(node->full_name), dev_name(dev)); node = node->parent; } } EXPORT_SYMBOL_GPL(of_device_make_bus_id);
2696 525 1576 2697 95 927 920 137 815 204 4 124 124 124 137 135 4 124 14 18 137 137 137 136 137 137 31 98 97 739 742 448 2762 2767 2448 211 197 107 37 37 76 76 75 76 76 1 75 75 76 4 4 4 1369 1364 695 102 100 69 690 687 1007 1009 11 11 102 56 102 93 100 86 14 56 101 2 35 35 35 35 31 31 2 690 689 1350 1157 1055 108 1157 1 101 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 // SPDX-License-Identifier: GPL-2.0-only /* File: fs/xattr.c Extended attribute handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com> Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> #include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) { while (*a_prefix && *a == *a_prefix) { a++; a_prefix++; } return *a_prefix ? NULL : a; } /* * In order to implement different sets of xattr operations for each xattr * prefix, a filesystem should create a null-terminated array of struct * xattr_handler (one for each prefix) and hang a pointer to it off of the * s_xattr field of the superblock. */ #define for_each_xattr_handler(handlers, handler) \ if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) /* * Find the xattr_handler with the matching prefix. */ static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return ERR_PTR(-EIO); return ERR_PTR(-EOPNOTSUPP); } for_each_xattr_handler(handlers, handler) { const char *n; n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { if (!handler->prefix ^ !*n) { if (*n) continue; return ERR_PTR(-EINVAL); } *name = n; return handler; } } return ERR_PTR(-EOPNOTSUPP); } /** * may_write_xattr - check whether inode allows writing xattr * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never * set or remove an extended attribute on a read-only filesystem or on an * immutable / append-only inode. * * We also need to ensure that the inode has a mapping in the mount to * not risk writing back invalid i_{g,u}id values. * * Return: On success zero is returned. On error a negative errno is returned. */ int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. */ static int xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; ret = may_write_xattr(idmap, inode); if (ret) return ret; } /* * No restriction for security.* and system.* from the VFS. Decision * on these is left to the underlying filesystem / security module. */ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return 0; /* * The trusted.* namespace can only be accessed by privileged users. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(idmap, inode)) return -EPERM; } return inode_permission(idmap, inode, mask); } /* * Look for any handler that deals with the specified namespace. */ int xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return -EIO; return -EOPNOTSUPP; } for_each_xattr_handler(handlers, handler) { if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_rwsem before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; const void *orig_value = value; int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; } retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } if (value != orig_value) kfree(value); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. The xattr value buffer should * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. */ int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); /** * vfs_listxattr - retrieve \0 separated list of xattr names * @dentry: the dentry from whose inode the xattr names are retrieved * @list: buffer to store xattr names into * @size: size of the buffer * * This function returns the names of all xattrs associated with the * inode of @dentry. * * Note, for legacy reasons the vfs_listxattr() function lists POSIX * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the * vfs_listxattr() function doesn't check for this flag since a * filesystem could implement POSIX ACLs without implementing any other * xattrs. * * However, since all codepaths that remove IOP_XATTR also assign of * inode operations that either don't implement or implement a stub * ->listxattr() operation. * * Return: On success, the size of the buffer that was used. On error a * negative error code. */ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(idmap, dentry, name); if (error) return error; fsnotify_xattr(dentry); security_inode_post_removexattr(dentry, name); out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); int import_xattr_name(struct xattr_name *kname, const char __user *name) { int error = strncpy_from_user(kname->name, name, sizeof(kname->name)); if (error == 0 || error == sizeof(kname->name)) return -ERANGE; if (error < 0) return error; return 0; } /* * Extended attribute SET operations */ int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = import_xattr_name(ctx->kname, name); if (error) return error; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); if (IS_ERR(ctx->kvalue)) { error = PTR_ERR(ctx->kvalue); ctx->kvalue = NULL; } } return error; } static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_setxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, const void __user *value, size_t size, int flags) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = flags, }; struct filename *filename; unsigned int lookup_flags = 0; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; error = setxattr_copy(name, &ctx); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; else error = file_setxattr(fd_file(f), &ctx); } else { error = filename_setxattr(dfd, filename, lookup_flags, &ctx); } kvfree(ctx.kvalue); return error; } SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, const struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; return path_setxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size, args.flags); } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size, flags); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size, flags); } /* * Extended attribute GET operations */ static ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; void *kvalue = NULL; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; kvalue = kvzalloc(ctx->size, GFP_KERNEL); if (!kvalue) return -ENOMEM; } if (is_posix_acl_xattr(kname)) error = do_get_acl(idmap, d, kname, kvalue, ctx->size); else error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(kvalue); return error; } ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx) { audit_file(f); return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); } /* unconditionally consumes filename */ ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_getxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, void __user *value, size_t size) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .value = value, .size = size, .kname = &kname, .flags = 0, }; struct filename *filename; ssize_t error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_getxattr(fd_file(f), &ctx); } else { int lookup_flags = 0; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; return filename_getxattr(dfd, filename, lookup_flags, &ctx); } } SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; if (args.flags != 0) return -EINVAL; return path_getxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size); } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size); } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(klist); return error; } static ssize_t file_listxattr(struct file *f, char __user *list, size_t size) { audit_file(f); return listxattr(f->f_path.dentry, list, size); } /* unconditionally consumes filename */ static ssize_t filename_listxattr(int dfd, struct filename *filename, unsigned int lookup_flags, char __user *list, size_t size) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_listxattrat(int dfd, const char __user *pathname, unsigned int at_flags, char __user *list, size_t size) { struct filename *filename; int lookup_flags; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_listxattr(fd_file(f), list, size); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_listxattr(dfd, filename, lookup_flags, list, size); } SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, char __user *, list, size_t, size) { return path_listxattrat(dfd, pathname, at_flags, list, size); } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size); } /* * Extended attribute REMOVE operations */ static long removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) { if (is_posix_acl_xattr(name)) return vfs_remove_acl(idmap, d, name); return vfs_removexattr(idmap, d, name); } static int file_removexattr(struct file *f, struct xattr_name *kname) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = removexattr(file_mnt_idmap(f), f->f_path.dentry, kname->name); mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ static int filename_removexattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct xattr_name *kname) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_removexattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name) { struct xattr_name kname; struct filename *filename; unsigned int lookup_flags; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_removexattr(fd_file(f), &kname); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_removexattr(dfd, filename, lookup_flags, &kname); } SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name) { return path_removexattrat(dfd, pathname, at_flags, name); } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name); } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len; len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /** * generic_listxattr - run through a dentry's xattr list() operations * @dentry: dentry to list the xattrs * @buffer: result buffer * @buffer_size: size of @buffer * * Combine the results of the list() operation from every xattr_handler in the * xattr_handler stack. * * Note that this will not include the entries for POSIX ACLs. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; for_each_xattr_handler(handlers, handler) { int err; if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); if (err) return err; } return buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /** * simple_xattr_space - estimate the memory used by a simple xattr * @name: the full name of the xattr * @size: the size of its value * * This takes no account of how much larger the two slab objects actually are: * that would depend on the slab implementation, when what is required is a * deterministic number, which grows with name length and size and quantity. * * Return: The approximate number of bytes of memory used by such an xattr. */ size_t simple_xattr_space(const char *name, size_t size) { /* * Use "40" instead of sizeof(struct simple_xattr), to return the * same result on 32-bit and 64-bit, and even if simple_xattr grows. */ return 40 + size + strlen(name); } /** * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. */ void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); kvfree(xattr); } /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object * @size: size of @value * * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * * Return: On success a new xattr object is returned. On failure NULL is * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return NULL; new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } /** * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry * @key: xattr name * @node: current node * * Compare the xattr name with the xattr name attached to @node in the rbtree. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @node matches @key. */ static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) { const char *xattr_name = key; const struct simple_xattr *xattr; xattr = rb_entry(node, struct simple_xattr, rb_node); return strcmp(xattr->name, xattr_name); } /** * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @new_node matches the xattr attached to @node. */ static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, const struct rb_node *node) { struct simple_xattr *xattr; xattr = rb_entry(new_node, struct simple_xattr, rb_node); return rbtree_simple_xattr_cmp(xattr->name, node); } /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @buffer: the buffer to store the value into * @size: the size of @buffer * * Try to find and retrieve the xattr object associated with @name. * If @buffer is provided store the value of @xattr in @buffer * otherwise just return the length. The size of @buffer is limited * to XATTR_SIZE_MAX which currently is 65536. * * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr = NULL; struct rb_node *rbp; int ret = -ENODATA; read_lock(&xattrs->lock); rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); if (rbp) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } } read_unlock(&xattrs->lock); return ret; } /** * simple_xattr_set - set an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE * is specified in @flags a matching xattr object for @name must already exist. * If it does it will be replaced with the new xattr object. If it doesn't we * fail. If XATTR_CREATE is specified and a matching xattr does already exist * we fail. If it doesn't we create a new xattr. If @flags is zero we simply * insert the new xattr replacing any existing one. * * If @value is empty and a matching xattr object is found we delete it if * XATTR_REPLACE is specified in @flags or @flags is zero. * * If @value is empty and no matching xattr object for @name is found we do * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) return ERR_PTR(-ENOMEM); new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { simple_xattr_free(new_xattr); return ERR_PTR(-ENOMEM); } } write_lock(&xattrs->lock); rbp = &xattrs->rb_root.rb_node; while (*rbp) { parent = *rbp; ret = rbtree_simple_xattr_cmp(name, *rbp); if (ret < 0) rbp = &(*rbp)->rb_left; else if (ret > 0) rbp = &(*rbp)->rb_right; else old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); if (old_xattr) break; } if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; goto out_unlock; } if (new_xattr) rb_replace_node(&old_xattr->rb_node, &new_xattr->rb_node, &xattrs->rb_root); else rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { err = -ENODATA; goto out_unlock; } /* * If XATTR_CREATE or no flags are specified together with a * new value simply insert it. */ if (new_xattr) { rb_link_node(&new_xattr->rb_node, parent, rbp); rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); } /* * If XATTR_CREATE or no flags are specified and neither an * old or new xattr exist then we don't need to do anything. */ } out_unlock: write_unlock(&xattrs->lock); if (!err) return old_xattr; simple_xattr_free(new_xattr); return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } static bool xattr_is_maclabel(const char *name) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && security_ismaclabel(suffix); } /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs * @xattrs: the header of the xattr object * @buffer: the buffer to store all xattrs into * @size: the size of @buffer * * List all xattrs associated with @inode. If @buffer is NULL we returned * the required size of the buffer. If @buffer is provided we store the * xattrs value into it provided it is big enough. * * Note, the number of xattr names that can be listed with listxattr(2) is * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names * are found it will return -E2BIG. * * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; err = posix_acl_listxattr(inode, &buffer, &remaining_size); if (err) return err; err = security_inode_listsecurity(inode, buffer, remaining_size); if (err < 0) return err; if (buffer) { if (remaining_size < err) return -ERANGE; buffer += err; } remaining_size -= err; err = 0; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; /* skip MAC labels; these are provided by LSM above */ if (xattr_is_maclabel(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } /** * rbtree_simple_xattr_less - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * Note that this function technically tolerates duplicate entries. * * Return: True if insertion point in the rbtree is found. */ static bool rbtree_simple_xattr_less(struct rb_node *new_node, const struct rb_node *node) { return rbtree_simple_xattr_node_cmp(new_node, node) < 0; } /** * simple_xattr_add - add xattr objects * @xattrs: the header of the xattr object * @new_xattr: the xattr object to add * * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. */ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { write_lock(&xattrs->lock); rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); write_unlock(&xattrs->lock); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * * Initialize relevant fields of a an xattr header. */ void simple_xattrs_init(struct simple_xattrs *xattrs) { xattrs->rb_root = RB_ROOT; rwlock_init(&xattrs->lock); } /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; if (freed_space) *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; struct rb_node *rbp_next; rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); if (freed_space) *freed_space += simple_xattr_space(xattr->name, xattr->size); simple_xattr_free(xattr); rbp = rbp_next; } }
1049 1045 3 563 262 562 1595 1594 1591 271 1595 1591 1591 1596 1595 564 564 562 1590 1595 1592 822 1090 1559 1587 1592 1046 1009 324 1049 325 1008 352 323 325 558 558 559 261 262 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/group.c - Operations for adding/removing multiple files at once. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2013 Greg Kroah-Hartman * Copyright (c) 2013 The Linux Foundation */ #include <linux/kobject.h> #include <linux/module.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> #include <linux/fs.h> #include "sysfs.h" static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj) { if (grp->attrs && grp->attrs[0] && grp->is_visible) return grp->is_visible(kobj, grp->attrs[0], 0); if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible) return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); return 0; } static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) { struct attribute *const *attr; const struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or * visibility. Do this by first removing then * re-adding (if required) the file. */ if (update) kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*attr)->name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, mode, uid, gid, NULL); if (unlikely(error)) break; } if (error) { remove_files(parent, grp); goto exit; } } if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode; size_t size = (*bin_attr)->size; if (update) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; } if (grp->bin_size) size = grp->bin_size(kobj, *bin_attr, i); WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*bin_attr)->attr.name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, mode, size, uid, gid, NULL); if (error) break; } if (error) remove_files(parent, grp); } exit: return error; } static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { struct kernfs_node *kn; kuid_t uid; kgid_t gid; int error; if (WARN_ON(!kobj || (!update && !kobj->sd))) return -EINVAL; /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; if (!grp->attrs && !grp->bin_attrs) { pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n", kobj->name, grp->name ?: ""); return 0; } kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { umode_t mode = __first_visible(grp, kobj); if (mode & SYSFS_GROUP_INVISIBLE) mode = 0; else mode = S_IRWXU | S_IRUGO | S_IXUGO; if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { pr_debug("attr grp %s/%s not created yet\n", kobj->name, grp->name); /* may have been invisible prior to this update */ update = 0; } else if (!mode) { sysfs_remove_group(kobj, grp); kernfs_put(kn); return 0; } } if (!update) { if (!mode) return 0; kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); return PTR_ERR(kn); } } } else { kn = kobj->sd; } kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { if (grp->name) kernfs_remove(kn); } kernfs_put(kn); if (grp->name && update) kernfs_put(kn); return error; } /** * sysfs_create_group - given a directory kobject, create an attribute group * @kobj: The kobject to create the group on * @grp: The attribute group to create * * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 0, grp); } EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, const struct attribute_group **groups) { int error = 0; int i; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = internal_create_group(kobj, update, groups[i]); if (error) { while (--i >= 0) sysfs_remove_group(kobj, groups[i]); break; } } return error; } /** * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to create the group on * @groups: The attribute groups to create, NULL terminated * * This function creates a bunch of attribute groups. If an error occurs when * creating a group, all previously created groups will be removed, unwinding * everything back to the original state when this function was called. * It will explicitly warn and error if any of the attribute files being * created already exist. * * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 0, groups); } EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to update the group on * @groups: The attribute groups to update, NULL terminated * * This function update a bunch of attribute groups. If an error occurs when * updating a group, all previously updated groups will be removed together * with already existing (not updated) attributes. * * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 1, groups); } EXPORT_SYMBOL_GPL(sysfs_update_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group * @kobj: The kobject to update the group on * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any * of the attribute files being created already exist. Furthermore, * if the visibility of the files has changed through the is_visible() * callback, it will update the permissions and add or remove the * relevant files. Changing a group's name (subdirectory name under * kobj's directory in sysfs) is not allowed. * * The primary use for this function is to call it after making a change * that affects group visibility. * * Returns 0 on success or error code on failure. */ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 1, grp); } EXPORT_SYMBOL_GPL(sysfs_update_group); /** * sysfs_remove_group: remove a group from a kobject * @kobj: kobject to remove the group from * @grp: group to remove * * This function removes a group of attributes from a kobject. The attributes * previously have to have been created for this group, otherwise it will fail. */ void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { pr_debug("sysfs group '%s' not found for kobject '%s'\n", grp->name, kobject_name(kobj)); return; } } else { kn = parent; kernfs_get(kn); } remove_files(kn, grp); if (grp->name) kernfs_remove(kn); kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); /** * sysfs_remove_groups - remove a list of groups * * @kobj: The kobject for the groups to be removed from * @groups: NULL terminated list of groups to be removed * * If groups is not NULL, remove the specified groups from the kobject. */ void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { int i; if (!groups) return; for (i = 0; groups[i]; i++) sysfs_remove_group(kobj, groups[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to create and the attribute group they belong to. * * This function returns an error if the group doesn't exist, the .name field is * NULL or any of the files already exist in that group, in which case none of * the new files are created. */ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error = 0; struct attribute *const *attr; int i; parent = kernfs_find_and_get(kobj->sd, grp->name); if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); } kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_merge_group); /** * sysfs_unmerge_group - remove files from a pre-existing named attribute group. * @kobj: The kobject containing the group. * @grp: The files to remove and the attribute group they belong to. */ void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; struct attribute *const *attr; parent = kernfs_find_and_get(kobj->sd, grp->name); if (parent) { for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(parent, (*attr)->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); /** * sysfs_add_link_to_group - add a symlink to an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @target: The target kobject of the symlink to create. * @link_name: The name of the symlink to create. */ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { struct kernfs_node *parent; int error = 0; parent = kernfs_find_and_get(kobj->sd, group_name); if (!parent) return -ENOENT; error = sysfs_create_link_sd(parent, target, link_name); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); /** * sysfs_remove_link_from_group - remove a symlink from an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @link_name: The name of the symlink to remove. */ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { struct kernfs_node *parent; parent = kernfs_find_and_get(kobj->sd, group_name); if (parent) { kernfs_remove_by_name(parent, link_name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. * @symlink_name: The name of the symlink file (target_name will be * considered if symlink_name is NULL). */ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; struct kernfs_node *link; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() * for details. */ spin_lock(&sysfs_symlink_target_lock); target = target_kobj->sd; if (target) kernfs_get(target); spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; } if (!symlink_name) symlink_name = target_name; link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); static int sysfs_group_attrs_change_owner(struct kobject *kobj, struct kernfs_node *grp_kn, const struct attribute_group *grp, struct iattr *newattrs) { struct kernfs_node *kn; int error, i; umode_t mode; if (grp->attrs) { struct attribute *const *attr; for (i = 0, attr = grp->attrs; *attr; i++, attr++) { if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (mode & SYSFS_GROUP_INVISIBLE) break; if (!mode) continue; } kn = kernfs_find_and_get(grp_kn, (*attr)->name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } if (grp->bin_attrs) { const struct bin_attribute *const *bin_attr; for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); if (mode & SYSFS_GROUP_INVISIBLE) break; if (!mode) continue; } kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } return 0; } /** * sysfs_group_change_owner - change owner of an attribute group. * @kobj: The kobject containing the group. * @grp: The attribute group. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *grp, kuid_t kuid, kgid_t kgid) { struct kernfs_node *grp_kn; int error; struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; if (!kobj->state_in_sysfs) return -EINVAL; if (grp->name) { grp_kn = kernfs_find_and_get(kobj->sd, grp->name); } else { kernfs_get(kobj->sd); grp_kn = kobj->sd; } if (!grp_kn) return -ENOENT; error = kernfs_setattr(grp_kn, &newattrs); if (!error) error = sysfs_group_attrs_change_owner(kobj, grp_kn, grp, &newattrs); kernfs_put(grp_kn); return error; } EXPORT_SYMBOL_GPL(sysfs_group_change_owner); /** * sysfs_groups_change_owner - change owner of a set of attribute groups. * @kobj: The kobject containing the groups. * @groups: The attribute groups. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { int error = 0, i; if (!kobj->state_in_sysfs) return -EINVAL; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); if (error) break; } return error; } EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
343 343 369 369 191 47 22 162 184 50 139 139 184 28 183 122 16 98 42 50 3 425 426 2 2 3 212 31 19 12 275 28 28 184 51 84 183 275 221 53 137 137 256 19 17 25 24 446 437 3 1 4 2 1 1 2 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 /* * Ext4 orphan inode handling */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int looped = 0; /* * Find block with free orphan entry. Use CPU number for a naive hash * for a search start in the orphan file */ start = raw_smp_processor_id()*13 % oi->of_blocks; i = start; do { if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) >= 0) { found = true; break; } if (++i >= oi->of_blocks) i = 0; } while (i != start); if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time. The additional * credits we would have to reserve for each orphan inode * operation just don't seem worth it. */ return -ENOSPC; } ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); /* Find empty slot in a block */ j = 0; do { if (looped) { /* * Did we walk through the block several times without * finding free entry? It is theoretically possible * if entries get constantly allocated and freed or * if the block is corrupted. Avoid indefinite looping * and bail. We'll use orphan list instead. */ if (looped > 3) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return -ENOSPC; } cond_resched(); } while (bdata[j]) { if (++j >= inodes_per_ob) { j = 0; looped++; } } } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != (__le32)0); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; bool dirty = false; if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_inode_orphan_tracked(inode)) return 0; /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); if (sbi->s_orphan_info.of_blocks) { err = ext4_orphan_file_add(handle, inode); /* * Fallback to normal orphan list of orphan file is * out of space */ if (err != -ENOSPC) return err; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); mutex_unlock(&sbi->s_orphan_lock); if (dirty) { err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; if (err) { /* * We have to remove inode from in-memory list if * addition to on disk orphan list failed. Stray orphan * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } else brelse(iloc.bh); ext4_debug("superblock will point to %lu\n", inode->i_ino); ext4_debug("orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); return err; } static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) { struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; __le32 *bdata; int blk, off; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int ret = 0; if (!handle) goto out; blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; if (WARN_ON_ONCE(blk >= oi->of_blocks)) goto out; ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); bdata[off] = 0; atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); return ret; } /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. */ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; if (handle) { /* Grab inode buffer early before taking global s_orphan_lock */ err = ext4_reserve_inode_write(handle, inode, &iloc); } mutex_lock(&sbi->s_orphan_lock); ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ if (!handle || err) { mutex_unlock(&sbi->s_orphan_lock); goto out_err; } ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); out_err: ext4_std_error(inode->i_sb, err); return err; out_brelse: brelse(iloc.bh); goto out_err; } #ifdef CONFIG_QUOTA static int ext4_quota_on_mount(struct super_block *sb, int type) { return dquot_quota_on_mount(sb, rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], lockdep_is_held(&sb->s_umount)), EXT4_SB(sb)->s_jquota_fmt, type); } #endif static void ext4_process_orphan(struct inode *inode, int *nr_truncates, int *nr_orphans) { struct super_block *sb = inode->i_sb; int ret; dquot_initialize(inode); if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); ext4_debug("truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); if (ret) { /* * We need to clean up the in-core orphan list * manually if ext4_truncate() failed to get a * transaction handle. */ ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); } inode_unlock(inode); (*nr_truncates)++; } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); ext4_debug("deleting unreferenced inode %lu\n", inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ } /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these * inodes at recovery time (only with a read-write filesystem). * * In order to keep the orphan inode chain consistent during traversal (in * case of crash during recovery), we link each inode into the superblock * orphan list_head and handle it the same way as an inode deletion during * normal operation (which journals the operations for us). * * We only do an iget() and an iput() on each inode, which is very safe if we * accidentally point at an in-use or already deleted inode. The worst that * can happen in this case is that we get a "bit already cleared" message from * ext4_free_inode(). The only reason we would point at a wrong inode is if * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. */ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; struct inode *inode; int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; #endif __le32 *bdata; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { ext4_debug("no orphan inodes to clean up\n"); return; } if (bdev_read_only(sb->s_bdev)) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, skipping orphan cleanup"); return; } /* Check if feature set would not allow a r/w mount */ if (!ext4_feature_set_ok(sb, 0)) { ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", ret); } /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " "quota: type %d: error %d", i, ret); } } #endif while (es->s_last_orphan) { /* * We may have encountered an error during cleanup; if * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } for (i = 0; i < oi->of_blocks; i++) { bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); for (j = 0; j < inodes_per_ob; j++) { if (!bdata[j]) continue; inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); if (IS_ERR(inode)) continue; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } } #define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", PLURAL(nr_orphans)); if (nr_truncates) ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn off quotas if they were enabled for orphan cleanup */ if (quota_update) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } } #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } void ext4_release_orphan_info(struct super_block *sb) { int i; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; if (!oi->of_blocks) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); kvfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( struct super_block *sb, struct buffer_head *bh) { return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)); } static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct buffer_head *bh) { __u32 calculated; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } /* This gets called only when checksumming is enabled */ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { struct super_block *sb = EXT4_TRIGGER(triggers)->sb; __u32 csum; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } int ext4_init_orphan_info(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct inode *inode; int i, j; int ret; int free; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); if (!ext4_has_feature_orphan_file(sb)) return 0; inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } /* * This is just an artificial limit to prevent corrupted fs from * consuming absurd amounts of memory when pinning blocks of orphan * file in memory. */ if (inode->i_size > 8 << 20) { ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; goto out_put; } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; oi->of_binfo = kvmalloc_array(oi->of_blocks, sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; } for (i = 0; i < oi->of_blocks; i++) { oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); if (IS_ERR(oi->of_binfo[i].ob_bh)) { ret = PTR_ERR(oi->of_binfo[i].ob_bh); goto out_free; } if (!oi->of_binfo[i].ob_bh) { ret = -EIO; goto out_free; } ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { ext4_error(sb, "orphan file block %d: bad magic", i); ret = -EIO; goto out_free; } if (!ext4_orphan_file_block_csum_verify(sb, oi->of_binfo[i].ob_bh)) { ext4_error(sb, "orphan file block %d: bad checksum", i); ret = -EIO; goto out_free; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); free = 0; for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); kvfree(oi->of_binfo); out_put: iput(inode); return ret; } int ext4_orphan_file_empty(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int i; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) if (atomic_read(&oi->of_binfo[i].ob_free_entries) != inodes_per_ob) return 0; return 1; }
21 21 114 106 20 104 264 103 12 1 115 115 96 264 1 103 96 13 103 104 71 21 31 18 13 3 9 7 104 103 104 104 104 104 104 104 104 103 1 96 7 13 12 12 12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_rmap_item.h" #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_trace.h" #include "xfs_rtgroup.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; static const struct xfs_item_ops xfs_rui_item_ops; static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rui_log_item, rui_item); } STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { kvfree(ruip->rui_item.li_lv_shadow); if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kfree(ruip); else kmem_cache_free(xfs_rui_cache, ruip); } /* * Freeing the RUI requires that we remove it from the AIL if it has already * been placed there. However, the RUI may not yet have been placed in the AIL * when called by xfs_rui_release() from RUD processing due to the ordering of * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the RUI. */ STATIC void xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); if (!atomic_dec_and_test(&ruip->rui_refcount)) return; xfs_trans_ail_delete(&ruip->rui_item, 0); xfs_rui_item_free(ruip); } STATIC void xfs_rui_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); *nvecs += 1; *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } unsigned int xfs_rui_log_space(unsigned int nr) { return xlog_item_space(1, xfs_rui_log_format_sizeof(nr)); } /* * This is called to fill in the vector of log iovecs for the * given rui log item. We use only 1 iovec, and we point that * at the rui_log_format structure embedded in the rui item. * It is at this point that we assert that all of the extent * slots in the rui item have been filled. */ STATIC void xfs_rui_item_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_log_iovec *vecp = NULL; ASSERT(atomic_read(&ruip->rui_next_extent) == ruip->rui_format.rui_nextents); ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); ruip->rui_format.rui_type = lip->li_type; ruip->rui_format.rui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents)); } /* * The unpin operation is the last place an RUI is manipulated in the log. It is * either inserted in the AIL or aborted in the event of a log I/O error. In * either case, the RUI transaction has been successfully committed to make it * this far. Therefore, we expect whoever committed the RUI to either construct * and commit the RUD or drop the RUD's reference in the event of error. Simply * drop the log's RUI reference now that the log is done with it. */ STATIC void xfs_rui_item_unpin( struct xfs_log_item *lip, int remove) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); xfs_rui_release(ruip); } /* * The RUI has been either committed or aborted if the transaction has been * cancelled. If the transaction was cancelled, an RUD isn't going to be * constructed and thus we free the RUI here directly. */ STATIC void xfs_rui_item_release( struct xfs_log_item *lip) { xfs_rui_release(RUI_ITEM(lip)); } /* * Allocate and initialize an rui item with the given number of extents. */ STATIC struct xfs_rui_log_item * xfs_rui_init( struct xfs_mount *mp, unsigned short item_type, uint nextents) { struct xfs_rui_log_item *ruip; ASSERT(nextents > 0); ASSERT(item_type == XFS_LI_RUI || item_type == XFS_LI_RUI_RT); if (nextents > XFS_RUI_MAX_FAST_EXTENTS) ruip = kzalloc(xfs_rui_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); else ruip = kmem_cache_zalloc(xfs_rui_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &ruip->rui_item, item_type, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; atomic_set(&ruip->rui_next_extent, 0); atomic_set(&ruip->rui_refcount, 2); return ruip; } static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); } STATIC void xfs_rud_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { *nvecs += 1; *nbytes += sizeof(struct xfs_rud_log_format); } unsigned int xfs_rud_log_space(void) { return xlog_item_space(1, sizeof(struct xfs_rud_log_format)); } /* * This is called to fill in the vector of log iovecs for the * given rud log item. We use only 1 iovec, and we point that * at the rud_log_format structure embedded in the rud item. * It is at this point that we assert that all of the extent * slots in the rud item have been filled. */ STATIC void xfs_rud_item_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_rud_log_item *rudp = RUD_ITEM(lip); struct xfs_log_iovec *vecp = NULL; ASSERT(lip->li_type == XFS_LI_RUD || lip->li_type == XFS_LI_RUD_RT); rudp->rud_format.rud_type = lip->li_type; rudp->rud_format.rud_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format, sizeof(struct xfs_rud_log_format)); } /* * The RUD is either committed or aborted if the transaction is cancelled. If * the transaction is cancelled, drop our reference to the RUI and free the * RUD. */ STATIC void xfs_rud_item_release( struct xfs_log_item *lip) { struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); kvfree(rudp->rud_item.li_lv_shadow); kmem_cache_free(xfs_rud_cache, rudp); } static struct xfs_log_item * xfs_rud_item_intent( struct xfs_log_item *lip) { return &RUD_ITEM(lip)->rud_ruip->rui_item; } static const struct xfs_item_ops xfs_rud_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | XFS_ITEM_INTENT_DONE, .iop_size = xfs_rud_item_size, .iop_format = xfs_rud_item_format, .iop_release = xfs_rud_item_release, .iop_intent = xfs_rud_item_intent, }; static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e) { return list_entry(e, struct xfs_rmap_intent, ri_list); } static inline bool xfs_rui_item_isrt(const struct xfs_log_item *lip) { ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); return lip->li_type == XFS_LI_RUI_RT; } /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( void *priv, const struct list_head *a, const struct list_head *b) { struct xfs_rmap_intent *ra = ri_entry(a); struct xfs_rmap_intent *rb = ri_entry(b); return ra->ri_group->xg_gno - rb->ri_group->xg_gno; } /* Log rmap updates in the intent item. */ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, struct xfs_rui_log_item *ruip, struct xfs_rmap_intent *ri) { uint next_extent; struct xfs_map_extent *map; /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from * it. */ next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; ASSERT(next_extent < ruip->rui_format.rui_nextents); map = &ruip->rui_format.rui_extents[next_extent]; map->me_owner = ri->ri_owner; map->me_startblock = ri->ri_bmap.br_startblock; map->me_startoff = ri->ri_bmap.br_startoff; map->me_len = ri->ri_bmap.br_blockcount; map->me_flags = 0; if (ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN) map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; if (ri->ri_whichfork == XFS_ATTR_FORK) map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; switch (ri->ri_type) { case XFS_RMAP_MAP: map->me_flags |= XFS_RMAP_EXTENT_MAP; break; case XFS_RMAP_MAP_SHARED: map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; break; case XFS_RMAP_UNMAP: map->me_flags |= XFS_RMAP_EXTENT_UNMAP; break; case XFS_RMAP_UNMAP_SHARED: map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; break; case XFS_RMAP_CONVERT: map->me_flags |= XFS_RMAP_EXTENT_CONVERT; break; case XFS_RMAP_CONVERT_SHARED: map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; break; case XFS_RMAP_ALLOC: map->me_flags |= XFS_RMAP_EXTENT_ALLOC; break; case XFS_RMAP_FREE: map->me_flags |= XFS_RMAP_EXTENT_FREE; break; default: ASSERT(0); } } static struct xfs_log_item * __xfs_rmap_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort, unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; struct xfs_rui_log_item *ruip; struct xfs_rmap_intent *ri; ASSERT(count > 0); ruip = xfs_rui_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) xfs_rmap_update_log_item(tp, ruip, ri); return &ruip->rui_item; } static struct xfs_log_item * xfs_rmap_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort) { return __xfs_rmap_update_create_intent(tp, items, count, sort, XFS_LI_RUI); } static inline unsigned short xfs_rud_type_from_rui(const struct xfs_rui_log_item *ruip) { return xfs_rui_item_isrt(&ruip->rui_item) ? XFS_LI_RUD_RT : XFS_LI_RUD; } /* Get an RUD so we can process all the deferred rmap updates. */ static struct xfs_log_item * xfs_rmap_update_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { struct xfs_rui_log_item *ruip = RUI_ITEM(intent); struct xfs_rud_log_item *rudp; rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &rudp->rud_item, xfs_rud_type_from_rui(ruip), &xfs_rud_item_ops); rudp->rud_ruip = ruip; rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; return &rudp->rud_item; } /* Add this deferred RUI to the transaction. */ void xfs_rmap_defer_add( struct xfs_trans *tp, struct xfs_rmap_intent *ri) { struct xfs_mount *mp = tp->t_mountp; /* * Deferred rmap updates for the realtime and data sections must use * separate transactions to finish deferred work because updates to * realtime metadata files can lock AGFs to allocate btree blocks and * we don't want that mixing with the AGF locks taken to finish data * section updates. */ ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock, ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); trace_xfs_rmap_defer(mp, ri); xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? &xfs_rtrmap_update_defer_type : &xfs_rmap_update_defer_type); } /* Cancel a deferred rmap update. */ STATIC void xfs_rmap_update_cancel_item( struct list_head *item) { struct xfs_rmap_intent *ri = ri_entry(item); xfs_group_intent_put(ri->ri_group); kmem_cache_free(xfs_rmap_intent_cache, ri); } /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( struct xfs_trans *tp, struct xfs_log_item *done, struct list_head *item, struct xfs_btree_cur **state) { struct xfs_rmap_intent *ri = ri_entry(item); int error; error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_cancel_item(item); return error; } /* Clean up after calling xfs_rmap_finish_one. */ STATIC void xfs_rmap_finish_one_cleanup( struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error) { struct xfs_buf *agbp = NULL; if (rcur == NULL) return; agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error && agbp) xfs_trans_brelse(tp, agbp); } /* Abort all pending RUIs. */ STATIC void xfs_rmap_update_abort_intent( struct xfs_log_item *intent) { xfs_rui_release(RUI_ITEM(intent)); } /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( struct xfs_mount *mp, bool isrt, struct xfs_map_extent *map) { if (!xfs_has_rmapbt(mp)) return false; if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) return false; switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: case XFS_RMAP_EXTENT_UNMAP_SHARED: case XFS_RMAP_EXTENT_CONVERT: case XFS_RMAP_EXTENT_CONVERT_SHARED: case XFS_RMAP_EXTENT_ALLOC: case XFS_RMAP_EXTENT_FREE: break; default: return false; } if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && !xfs_verify_ino(mp, map->me_owner)) return false; if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; if (isrt) return xfs_verify_rtbext(mp, map->me_startblock, map->me_len); return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } static inline void xfs_rui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, bool isrt, const struct xfs_map_extent *map) { struct xfs_rmap_intent *ri; ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: ri->ri_type = XFS_RMAP_MAP; break; case XFS_RMAP_EXTENT_MAP_SHARED: ri->ri_type = XFS_RMAP_MAP_SHARED; break; case XFS_RMAP_EXTENT_UNMAP: ri->ri_type = XFS_RMAP_UNMAP; break; case XFS_RMAP_EXTENT_UNMAP_SHARED: ri->ri_type = XFS_RMAP_UNMAP_SHARED; break; case XFS_RMAP_EXTENT_CONVERT: ri->ri_type = XFS_RMAP_CONVERT; break; case XFS_RMAP_EXTENT_CONVERT_SHARED: ri->ri_type = XFS_RMAP_CONVERT_SHARED; break; case XFS_RMAP_EXTENT_ALLOC: ri->ri_type = XFS_RMAP_ALLOC; break; case XFS_RMAP_EXTENT_FREE: ri->ri_type = XFS_RMAP_FREE; break; default: ASSERT(0); return; } ri->ri_owner = map->me_owner; ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; ri->ri_bmap.br_startblock = map->me_startblock; ri->ri_bmap.br_startoff = map->me_startoff; ri->ri_bmap.br_blockcount = map->me_len; ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, isrt ? XG_TYPE_RTG : XG_TYPE_AG); ri->ri_realtime = isrt; xfs_defer_add_item(dfp, &ri->ri_list); } /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. */ STATIC int xfs_rmap_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_trans *tp; struct xfs_mount *mp = lip->li_log->l_mp; bool isrt = xfs_rui_item_isrt(lip); int i; int error = 0; /* * First check the validity of the extents described by the * RUI. If any are bad, then assume that all are bad and * just toss the RUI. */ for (i = 0; i < ruip->rui_format.rui_nextents; i++) { if (!xfs_rui_validate_map(mp, isrt, &ruip->rui_format.rui_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &ruip->rui_format, sizeof(ruip->rui_format)); return -EFSCORRUPTED; } xfs_rui_recover_work(mp, dfp, isrt, &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &ruip->rui_format, sizeof(ruip->rui_format)); if (error) goto abort_error; return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: xfs_trans_cancel(tp); return error; } /* Relog an intent item to push the log tail forward. */ static struct xfs_log_item * xfs_rmap_relog_intent( struct xfs_trans *tp, struct xfs_log_item *intent, struct xfs_log_item *done_item) { struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; unsigned int count; ASSERT(intent->li_type == XFS_LI_RUI || intent->li_type == XFS_LI_RUI_RT); count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; ruip = xfs_rui_init(tp->t_mountp, intent->li_type, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); return &ruip->rui_item; } const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .name = "rmap", .max_items = XFS_RUI_MAX_FAST_EXTENTS, .create_intent = xfs_rmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, .create_done = xfs_rmap_update_create_done, .finish_item = xfs_rmap_update_finish_item, .finish_cleanup = xfs_rmap_finish_one_cleanup, .cancel_item = xfs_rmap_update_cancel_item, .recover_work = xfs_rmap_recover_work, .relog_intent = xfs_rmap_relog_intent, }; #ifdef CONFIG_XFS_RT static struct xfs_log_item * xfs_rtrmap_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort) { return __xfs_rmap_update_create_intent(tp, items, count, sort, XFS_LI_RUI_RT); } /* Clean up after calling xfs_rmap_finish_one. */ STATIC void xfs_rtrmap_finish_one_cleanup( struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error) { if (rcur) xfs_btree_del_cursor(rcur, error); } const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { .name = "rtrmap", .max_items = XFS_RUI_MAX_FAST_EXTENTS, .create_intent = xfs_rtrmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, .create_done = xfs_rmap_update_create_done, .finish_item = xfs_rmap_update_finish_item, .finish_cleanup = xfs_rtrmap_finish_one_cleanup, .cancel_item = xfs_rmap_update_cancel_item, .recover_work = xfs_rmap_recover_work, .relog_intent = xfs_rmap_relog_intent, }; #else const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { .name = "rtrmap", }; #endif STATIC bool xfs_rui_item_match( struct xfs_log_item *lip, uint64_t intent_id) { return RUI_ITEM(lip)->rui_format.rui_id == intent_id; } static const struct xfs_item_ops xfs_rui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, .iop_match = xfs_rui_item_match, }; static inline void xfs_rui_copy_format( struct xfs_rui_log_format *dst, const struct xfs_rui_log_format *src) { unsigned int i; memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents)); for (i = 0; i < src->rui_nextents; i++) memcpy(&dst->rui_extents[i], &src->rui_extents[i], sizeof(struct xfs_map_extent)); } /* * This routine is called to create an in-core extent rmap update * item from the rui format structure which was logged on disk. * It allocates an in-core rui, copies the extents from the format * structure into it, and adds the rui to the AIL with the given * LSN. */ STATIC int xlog_recover_rui_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_mount *mp = log->l_mp; struct xfs_rui_log_item *ruip; struct xfs_rui_log_format *rui_formatp; size_t len; rui_formatp = item->ri_buf[0].iov_base; if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); xlog_recover_intent_item(log, &ruip->rui_item, lsn, &xfs_rmap_update_defer_type); return 0; } const struct xlog_recover_item_ops xlog_rui_item_ops = { .item_type = XFS_LI_RUI, .commit_pass2 = xlog_recover_rui_commit_pass2, }; #ifdef CONFIG_XFS_RT STATIC int xlog_recover_rtrui_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_mount *mp = log->l_mp; struct xfs_rui_log_item *ruip; struct xfs_rui_log_format *rui_formatp; size_t len; rui_formatp = item->ri_buf[0].iov_base; if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); xlog_recover_intent_item(log, &ruip->rui_item, lsn, &xfs_rtrmap_update_defer_type); return 0; } #else STATIC int xlog_recover_rtrui_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } #endif const struct xlog_recover_item_ops xlog_rtrui_item_ops = { .item_type = XFS_LI_RUI_RT, .commit_pass2 = xlog_recover_rtrui_commit_pass2, }; /* * This routine is called when an RUD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding RUI if it * was still in the log. To do this it searches the AIL for the RUI with an id * equal to that in the RUD format structure. If we find it we drop the RUD * reference, which removes the RUI from the AIL and frees it. */ STATIC int xlog_recover_rud_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_rud_log_format *rud_formatp; rud_formatp = item->ri_buf[0].iov_base; if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, rud_formatp, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); return 0; } const struct xlog_recover_item_ops xlog_rud_item_ops = { .item_type = XFS_LI_RUD, .commit_pass2 = xlog_recover_rud_commit_pass2, }; #ifdef CONFIG_XFS_RT STATIC int xlog_recover_rtrud_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_rud_log_format *rud_formatp; rud_formatp = item->ri_buf[0].iov_base; if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, rud_formatp, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } xlog_recover_release_intent(log, XFS_LI_RUI_RT, rud_formatp->rud_rui_id); return 0; } #else # define xlog_recover_rtrud_commit_pass2 xlog_recover_rtrui_commit_pass2 #endif const struct xlog_recover_item_ops xlog_rtrud_item_ops = { .item_type = XFS_LI_RUD_RT, .commit_pass2 = xlog_recover_rtrud_commit_pass2, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ALSA sequencer Timer * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> */ #ifndef __SND_SEQ_TIMER_H #define __SND_SEQ_TIMER_H #include <sound/timer.h> #include <sound/seq_kernel.h> struct snd_seq_timer_tick { snd_seq_tick_time_t cur_tick; /* current tick */ unsigned long resolution; /* time per tick in nsec */ unsigned long fraction; /* current time per tick in nsec */ }; struct snd_seq_timer { /* ... tempo / offset / running state */ unsigned int running:1, /* running state of queue */ initialized:1; /* timer is initialized */ unsigned int tempo; /* current tempo, us/tick */ int ppq; /* time resolution, ticks/quarter */ snd_seq_real_time_t cur_time; /* current time */ struct snd_seq_timer_tick tick; /* current tick */ int tick_updated; int type; /* timer type */ struct snd_timer_id alsa_id; /* ALSA's timer ID */ struct snd_timer_instance *timeri; /* timer instance */ unsigned int ticks; unsigned long preferred_resolution; /* timer resolution, ticks/sec */ unsigned int skew; unsigned int skew_base; unsigned int tempo_base; struct timespec64 last_update; /* time of last clock update, used for interpolation */ spinlock_t lock; }; /* create new timer (constructor) */ struct snd_seq_timer *snd_seq_timer_new(void); /* delete timer (destructor) */ void snd_seq_timer_delete(struct snd_seq_timer **tmr); /* */ static inline void snd_seq_timer_update_tick(struct snd_seq_timer_tick *tick, unsigned long resolution) { if (tick->resolution > 0) { tick->fraction += resolution; tick->cur_tick += (unsigned int)(tick->fraction / tick->resolution); tick->fraction %= tick->resolution; } } /* compare timestamp between events */ /* return 1 if a >= b; otherwise return 0 */ static inline int snd_seq_compare_tick_time(snd_seq_tick_time_t *a, snd_seq_tick_time_t *b) { /* compare ticks */ return (*a >= *b); } static inline int snd_seq_compare_real_time(snd_seq_real_time_t *a, snd_seq_real_time_t *b) { /* compare real time */ if (a->tv_sec > b->tv_sec) return 1; if ((a->tv_sec == b->tv_sec) && (a->tv_nsec >= b->tv_nsec)) return 1; return 0; } static inline void snd_seq_sanity_real_time(snd_seq_real_time_t *tm) { while (tm->tv_nsec >= 1000000000) { /* roll-over */ tm->tv_nsec -= 1000000000; tm->tv_sec++; } } /* increment timestamp */ static inline void snd_seq_inc_real_time(snd_seq_real_time_t *tm, snd_seq_real_time_t *inc) { tm->tv_sec += inc->tv_sec; tm->tv_nsec += inc->tv_nsec; snd_seq_sanity_real_time(tm); } static inline void snd_seq_inc_time_nsec(snd_seq_real_time_t *tm, unsigned long nsec) { tm->tv_nsec += nsec; snd_seq_sanity_real_time(tm); } /* called by timer isr */ struct snd_seq_queue; int snd_seq_timer_open(struct snd_seq_queue *q); int snd_seq_timer_close(struct snd_seq_queue *q); void snd_seq_timer_defaults(struct snd_seq_timer *tmr); void snd_seq_timer_reset(struct snd_seq_timer *tmr); int snd_seq_timer_stop(struct snd_seq_timer *tmr); int snd_seq_timer_start(struct snd_seq_timer *tmr); int snd_seq_timer_continue(struct snd_seq_timer *tmr); int snd_seq_timer_set_tempo(struct snd_seq_timer *tmr, int tempo); int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq, unsigned int tempo_base); int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position); int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position); int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base); snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, bool adjust_ktime); snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr); extern int seq_default_timer_class; extern int seq_default_timer_sclass; extern int seq_default_timer_card; extern int seq_default_timer_device; extern int seq_default_timer_subdevice; extern int seq_default_timer_resolution; #endif
26 26 26 15 22 4 12 10 13 11 4 4 1 7 7 7 87 86 78 14 14 14 19 2 20 20 24 24 8 24 4 20 7 12 12 12 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 // SPDX-License-Identifier: GPL-2.0-only /* * truncate.c * * PURPOSE * Truncate handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1999-2004 Ben Fennema * (C) 1999 Stelias Computing Inc * * HISTORY * * 02/24/99 blf Created. * */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/mm.h> #include "udf_i.h" #include "udf_sb.h" static void extent_trunc(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, uint32_t nelen) { struct kernel_lb_addr neloc = {}; int last_block = (elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (nelen) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, eloc, 0, last_block); etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); } else neloc = *eloc; nelen = (etype << 30) | nelen; } if (elen != nelen) { udf_write_aext(inode, epos, &neloc, nelen, 0); if (last_block > first_block) { if (etype == (EXT_RECORDED_ALLOCATED >> 30)) mark_inode_dirty(inode); if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) udf_free_blocks(inode->i_sb, inode, eloc, first_block, last_block - first_block); } } } /* * Truncate the last extent to match i_size. This function assumes * that preallocation extent is already truncated. */ void udf_truncate_tail_extent(struct inode *inode) { struct extent_position epos = {}; struct kernel_lb_addr eloc; uint32_t elen, nelen; uint64_t lbcount = 0; int8_t etype = -1, netype; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); int ret; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == iinfo->i_lenExtents) return; /* Are we going to delete the file anyway? */ if (inode->i_nlink == 0) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); /* Find the last extent in the file */ while (1) { ret = udf_next_aext(inode, &epos, &eloc, &elen, &netype, 1); if (ret <= 0) break; etype = netype; lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) udf_warn(inode->i_sb, "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n", (unsigned)inode->i_ino, (long long)inode->i_size, (long long)lbcount, (unsigned)eloc.logicalBlockNum, (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, &netype, 1) > 0) udf_err(inode->i_sb, "Extent after EOF in inode %u\n", (unsigned)inode->i_ino); break; } } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ if (ret >= 0) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } void udf_discard_prealloc(struct inode *inode) { struct extent_position epos = {}; struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1; struct udf_inode_info *iinfo = UDF_I(inode); int bsize = i_blocksize(inode); int8_t tmpetype = -1; int ret; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; epos.block = iinfo->i_location; /* Find the last extent in the file */ while (1) { ret = udf_next_aext(inode, &epos, &eloc, &elen, &tmpetype, 0); if (ret < 0) goto out; if (ret == 0) break; brelse(prev_epos.bh); prev_epos = epos; if (prev_epos.bh) get_bh(prev_epos.bh); ret = udf_next_aext(inode, &epos, &eloc, &elen, &etype, 1); if (ret < 0) goto out; lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { lbcount -= elen; udf_delete_aext(inode, prev_epos); udf_free_blocks(inode->i_sb, inode, &eloc, 0, DIV_ROUND_UP(elen, bsize)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; out: brelse(epos.bh); brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, struct extent_position *epos, u32 lenalloc) { struct super_block *sb = inode->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); int len = sizeof(struct allocExtDesc); aed->lengthAllocDescs = cpu_to_le32(lenalloc); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) len += lenalloc; udf_update_tag(epos->bh->b_data, len); mark_buffer_dirty_inode(epos->bh, inode); } /* * Truncate extents of inode to inode->i_size. This function can be used only * for making file shorter. For making file longer, udf_extend_file() has to * be used. */ int udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; loff_t byte_offset; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); int ret = 0; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); ret = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset, &etype); if (ret < 0) return ret; byte_offset = (offset << sb->s_blocksize_bits) + (inode->i_size & (sb->s_blocksize - 1)); if (ret == 0) { /* We should extend the file? */ WARN_ON(byte_offset); return 0; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); epos.offset += adsize; if (byte_offset) lenalloc = epos.offset; else lenalloc = epos.offset - adsize; if (!epos.bh) lenalloc -= udf_file_entry_alloc_offset(inode); else lenalloc -= sizeof(struct allocExtDesc); while ((ret = udf_current_aext(inode, &epos, &eloc, &elen, &etype, 0)) > 0) { if (etype == (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the * indirect extent - free it too */ BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; epos.bh = sb_bread(sb, udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? */ if (!epos.bh) return -EIO; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> sb->s_blocksize_bits; else indirect_ext_len = 1; } else { extent_trunc(inode, &epos, &eloc, etype, elen, 0); epos.offset += adsize; } } if (ret < 0) { brelse(epos.bh); return ret; } if (indirect_ext_len) { BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); return 0; }
17 16 1 1 17 17 17 17 17 17 17 16 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes! * * Basic futex operation and ordering guarantees * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value * again and verifies that the data has not changed. If it has not changed * it enqueues itself into the hash bucket, releases the hash bucket lock * and schedules. * * The waker side modifies the user space value of the futex and calls * futex_wake(). This function computes the hash bucket and acquires the * hash bucket lock. Then it looks for waiters on that futex in the hash * bucket and wakes them. * * In futex wake up scenarios where no tasks are blocked on a futex, taking * the hb spinlock can be avoided and simply return. In order for this * optimization to work, ordering guarantees must exist so that the waiter * being added to the list is acknowledged when the list is concurrently being * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * uval = *futex; * *futex = newval; * sys_futex(WAKE, futex); * futex_wake(futex); * if (queue_empty()) * return; * if (uval == val) * lock(hash_bucket(futex)); * queue(); * unlock(hash_bucket(futex)); * schedule(); * * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. * * The correct serialization ensures that a waiter either observes * the changed user space value before blocking or is woken by a * concurrent waker: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * * waiters++; (a) * smp_mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | * | * uval = *futex; | * | *futex = newval; * | sys_futex(WAKE, futex); * | futex_wake(futex); * | * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); * else wake_waiters(futex); * waiters--; (b) unlock(hash_bucket(futex)); * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write * to futex and the waiters read (see futex_hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * * X = Y = 0 * * w[X]=1 w[Y]=1 * MB MB * r[Y]=y r[X]=x * * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. * * Note that a new waiter is accounted for in (a) even when it is possible that * the wait call can return error, in which case we backtrack from it in (b). * Refer to the comment in futex_q_lock(). * * Similarly, in order to account for waiters being requeued on another * address we always increment the waiters for the destination bucket before * acquiring the lock. It then decrements them again after releasing it - * the code that actually moves the futex(es) between hash buckets (requeue_futex) * will do the additional required waiter count housekeeping. This is done for * double_lock_hb() and double_unlock_hb(), respectively. */ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). */ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; CLASS(hb, hb)(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", current->comm, oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return -ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; retry_private: if (1) { CLASS(hb, hb1)(&key1); CLASS(hb, hb2)(&key2); double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match(&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match(&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); } wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_do_wait() - wait for wakeup, timeout, or signal * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) { /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) schedule(); } __set_current_state(TASK_RUNNING); } /** * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * * Helper to unqueue a list of futexes. This can't fail. * * Return: * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; for (i = 0; i < count; i++) { if (!futex_unqueue(&v[i].q)) ret = i; } return ret; } /** * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes * @vs: The futex list to wait on * @count: The size of the list * @woken: Index of the last woken futex, if any. Used to notify the * caller that it can return this index to userspace (return parameter) * * Prepare multiple futexes in a single step and enqueue them. This may fail if * the futex list is invalid or if any futex was already awoken. On success the * task is ready to interruptible sleep. * * Return: * - 1 - One of the futexes was woken by another thread * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { bool retry = false; int ret, i; u32 uval; /* * Make sure to have a reference on the private_hash such that we * don't block on rehash after changing the task state below. */ guard(private_hash)(); /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to * make sure that current->state is TASK_INTERRUPTIBLE, so we don't * lose any wake events, which cannot be done before the get_futex_key * of the next key, because it calls get_user_pages, which can sleep. * Thus, we fetch the list of futexes keys in two steps, by first * pinning all the memory keys in the futex key, and only then we read * each key and queue the corresponding futex. * * Private futexes doesn't need to recalculate hash in retry, so skip * get_futex_key() when retrying. */ retry: for (i = 0; i < count; i++) { if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) return ret; } set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; if (1) { CLASS(hb, hb)(&q->key); futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); if (!ret && uval == val) { /* * The bucket lock can't be held while dealing with the * next futex. Queue each futex at this moment so hb can * be unlocked. */ futex_queue(q, hb, current); continue; } futex_q_unlock(hb); } __set_current_state(TASK_RUNNING); /* * Even if something went wrong, if we find out that a futex * was woken, we don't return error and return this index to * userspace */ *woken = futex_unqueue_multiple(vs, i); if (*woken >= 0) return 1; if (ret) { /* * If we need to handle a page fault, we need to do so * without any lock and any enqueued futex (otherwise * we could lose some wakeup). So we do it here, after * undoing all the work done so far. In success, we * retry all the work. */ if (get_user(uval, uaddr)) return -EFAULT; retry = true; goto retry; } if (uval != val) return -EWOULDBLOCK; } return 0; } /** * futex_sleep_multiple - Check sleeping conditions and sleep * @vs: List of futexes to wait for * @count: Length of vs * @to: Timeout * * Sleep if and only if the timeout hasn't expired and no futex on the list has * been woken up. */ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { if (to && !to->task) return; for (; count; count--, vs++) { if (!READ_ONCE(vs->q.lock_ptr)) return; } schedule(); } /** * futex_wait_multiple - Prepare to wait on and enqueue several futexes * @vs: The list of futexes to wait on * @count: The number of objects * @to: Timeout before giving up and returning to userspace * * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function * sleeps on a group of futexes and returns on the first futex that is * wake, or after the timeout has elapsed. * * Return: * - >=0 - Hint to the futex that was awoken * - <0 - On error */ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { int ret, hint = 0; if (to) hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); while (1) { ret = futex_wait_multiple_setup(vs, count, &hint); if (ret) { if (ret > 0) { /* A futex was woken during setup */ ret = hint; } return ret; } futex_sleep_multiple(vs, count, to); __set_current_state(TASK_RUNNING); ret = futex_unqueue_multiple(vs, count); if (ret >= 0) return ret; if (to && !to->task) return -ETIMEDOUT; else if (signal_pending(current)) return -ERESTARTSYS; /* * The final case is a spurious wakeup, for * which just retry. */ } } /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @key2: the second futex_key if used for requeue PI * @task: Task queueing this futex * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not * be read, does not contain the expected value or is not properly aligned. */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task) { u32 uval; int ret; /* * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for * any cond. If we locked the hash-bucket after testing *uaddr, that * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * * On the other hand, we insert q and release the hash-bucket only * after testing *uaddr. This guarantees that futex_wait() will NOT * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; retry_private: if (1) { CLASS(hb, hb)(&q->key); futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); if (ret) { futex_q_unlock(hb); ret = get_user(uval, uaddr); if (ret) return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } if (uval != val) { futex_q_unlock(hb); return -EWOULDBLOCK; } if (key2 && futex_match(&q->key, key2)) { futex_q_unlock(hb); return -EINVAL; } /* * The task state is guaranteed to be set before another task can * wake it. set_current_state() is implemented using smp_store_mb() and * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ if (task == current) set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb, task); } return ret; } int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; q.bitset = bitset; retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_do_wait(&q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) return 0; if (to && !to->task) return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ if (!signal_pending(current)) goto retry; return -ERESTARTSYS; } int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; int ret; to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns); ret = __futex_wait(uaddr, flags, val, to, bitset); /* No timeout, nothing to clean up. */ if (!to) return ret; hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); if (ret == -ERESTARTSYS) { restart = &current->restart_block; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; return set_restart_fn(restart, futex_wait_restart); } return ret; } static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { t = restart->futex.time; tp = &t; } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, tp, restart->futex.bitset); }
25 26 25 26 15 6 17 21 21 21 1 17 6 20 2 21 17 17 1 17 4 3 4 1 6 1 2 4 21 20 15 15 10 10 8 3 17 16 13 2 12 2 2 11 11 11 11 11 6 2 2 2 15 19 19 4 2 13 15 15 13 15 13 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_health.h" /* * Local function prototypes. */ static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, int first, int last); static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, int *entno); static int xfs_dir2_block_sort(const void *a, const void *b); static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; /* * One-time startup routine called from xfs_init(). */ void xfs_dir_startup(void) { xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } static xfs_failaddr_t xfs_dir3_block_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; } return __xfs_dir3_data_check(NULL, bp); } static void xfs_dir3_block_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_dir3_block_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void xfs_dir3_block_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; fa = xfs_dir3_block_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } if (!xfs_has_crc(mp)) return; if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, }; xfs_failaddr_t xfs_dir3_block_header_check( struct xfs_buf *bp, xfs_ino_t owner) { struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) return __this_address; if (be64_to_cpu(hdr3->owner) != owner) return __this_address; } return NULL; } int xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_ino_t owner, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); if (err || !*bpp) return err; /* Check things that we can't do in the verifier. */ fa = xfs_dir3_block_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } static void xfs_dir3_block_init( struct xfs_da_args *args, struct xfs_buf *bp) { struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); if (xfs_has_crc(mp)) { memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; } hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); } static void xfs_dir2_block_need_space( struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, struct xfs_dir2_leaf_entry *blp, __be16 **tagpp, struct xfs_dir2_data_unused **dupp, struct xfs_dir2_data_unused **enddupp, int *compact, int len) { struct xfs_dir2_data_free *bf; __be16 *tagp = NULL; struct xfs_dir2_data_unused *dup = NULL; struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); /* * If there are stale entries we'll use one for the leaf. */ if (btp->stale) { if (be16_to_cpu(bf[0].length) >= len) { /* * The biggest entry enough to avoid compaction. */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); goto out; } /* * Will need to compact to make this work. * Tag just before the first leaf entry. */ *compact = 1; tagp = (__be16 *)blp - 1; /* Data object just before the first leaf entry. */ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free then the data will go where the * leaf data starts now, if it works at all. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) dup = NULL; } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) dup = NULL; else dup = (xfs_dir2_data_unused_t *)blp; goto out; } /* * no stale entries, so just use free space. * Tag just before the first leaf entry. */ tagp = (__be16 *)blp - 1; /* Data object just before the first leaf entry. */ enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free then can't do this add without cleaning up: * the space before the first leaf entry needs to be free so it * can be expanded to hold the pointer to the new entry. */ if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { /* * Check out the biggest freespace and see if it's the same one. */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); if (dup != enddup) { /* * Not the same free entry, just check its length. */ if (be16_to_cpu(dup->length) < len) dup = NULL; goto out; } /* * It is the biggest freespace, can it hold the leaf too? */ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { /* * Yes, use the second-largest entry instead if it works. */ if (be16_to_cpu(bf[1].length) >= len) dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[1].offset)); else dup = NULL; } } out: *tagpp = tagp; *dupp = dup; *enddupp = enddup; } /* * compact the leaf entries. * Leave the highest-numbered stale entry stale. * XXX should be the one closest to mid but mid is not yet computed. */ static void xfs_dir2_block_compact( struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, struct xfs_dir2_leaf_entry *blp, int *needlog, int *lfloghigh, int *lfloglow) { int fromidx; /* source leaf index */ int toidx; /* target leaf index */ int needscan = 0; int highstale; /* high stale index */ fromidx = toidx = be32_to_cpu(btp->count) - 1; highstale = *lfloghigh = -1; for (; fromidx >= 0; fromidx--) { if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (highstale == -1) highstale = toidx; else { if (*lfloghigh == -1) *lfloghigh = toidx; continue; } } if (fromidx < toidx) blp[toidx] = blp[fromidx]; toidx--; } *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); *lfloghigh -= be32_to_cpu(btp->stale) - 1; be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), needlog, &needscan); btp->stale = cpu_to_be32(1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog); } /* * Add an entry to a block directory. */ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ int compact; /* need to compact leaf ents */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* directory inode */ xfs_dir2_data_unused_t *dup; /* block unused entry */ int error; /* error return value */ xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */ xfs_dahash_t hash; /* hash value of found entry */ int high; /* high index for binary srch */ int highstale; /* high stale index */ int lfloghigh=0; /* last final leaf to log */ int lfloglow=0; /* first final leaf to log */ int len; /* length of the new entry */ int low; /* low index for binary srch */ int lowstale; /* low stale index */ int mid=0; /* midpoint for binary srch */ int needlog; /* need to log header */ int needscan; /* need to rescan freespace */ __be16 *tagp; /* pointer to tag value */ xfs_trans_t *tp; /* transaction structure */ trace_xfs_dir2_block_addname(args); dp = args->dp; tp = args->trans; /* Read the (one and only) directory block into bp. */ error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; len = xfs_dir2_data_entsize(dp->i_mount, args->namelen); /* * Set up pointers to parts of the block. */ hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Find out if we can reuse stale entries or whether we need extra * space for entry and new leaf. */ xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup, &enddup, &compact, len); /* * Done everything we need for a space check now. */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, bp); if (!dup) return -ENOSPC; return 0; } /* * If we don't have space for the new entry & leaf ... */ if (!dup) { /* Don't have a space reservation: return no-space. */ if (args->total == 0) return -ENOSPC; /* * Convert to the next larger format. * Then add the new entry in that format. */ error = xfs_dir2_block_to_leaf(args, bp); if (error) return error; return xfs_dir2_leaf_addname(args); } needlog = needscan = 0; /* * If need to compact the leaf entries, do it now. */ if (compact) { xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); /* recalculate blp post-compaction */ blp = xfs_dir2_block_leaf_p(btp); } else if (btp->stale) { /* * Set leaf logging boundaries to impossible state. * For the no-stale case they're set explicitly. */ lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } /* * Find the slot that's first lower than our hash value, -1 if none. */ for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) { mid = (low + high) >> 1; if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) break; if (hash < args->hashval) low = mid + 1; else high = mid - 1; } while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) { mid--; } /* * No stale entries, will use enddup space to hold new leaf. */ if (!btp->stale) { xfs_dir2_data_aoff_t aoff; /* * Mark the space needed for the new leaf entry, now in use. */ aoff = (xfs_dir2_data_aoff_t)((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - sizeof(*blp)); error = xfs_dir2_data_use_free(args, bp, enddup, aoff, (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog, &needscan); if (error) return error; /* * Update the tail (entry count). */ be32_add_cpu(&btp->count, 1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) { xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); needscan = 0; } /* * Adjust pointer to the first leaf entry, we're about to move * the table up one to open up space for the new leaf entry. * Then adjust our index to match. */ blp--; mid++; if (mid) memmove(blp, &blp[1], mid * sizeof(*blp)); lfloglow = 0; lfloghigh = mid; } /* * Use a stale leaf for our new entry. */ else { for (lowstale = mid; lowstale >= 0 && blp[lowstale].address != cpu_to_be32(XFS_DIR2_NULL_DATAPTR); lowstale--) continue; for (highstale = mid + 1; highstale < be32_to_cpu(btp->count) && blp[highstale].address != cpu_to_be32(XFS_DIR2_NULL_DATAPTR) && (lowstale < 0 || mid - lowstale > highstale - mid); highstale++) continue; /* * Move entries toward the low-numbered stale entry. */ if (lowstale >= 0 && (highstale == be32_to_cpu(btp->count) || mid - lowstale <= highstale - mid)) { if (mid - lowstale) memmove(&blp[lowstale], &blp[lowstale + 1], (mid - lowstale) * sizeof(*blp)); lfloglow = min(lowstale, lfloglow); lfloghigh = max(mid, lfloghigh); } /* * Move entries toward the high-numbered stale entry. */ else { ASSERT(highstale < be32_to_cpu(btp->count)); mid++; if (highstale - mid) memmove(&blp[mid + 1], &blp[mid], (highstale - mid) * sizeof(*blp)); lfloglow = min(mid, lfloglow); lfloghigh = max(highstale, lfloghigh); } be32_add_cpu(&btp->stale, -1); } /* * Point to the new data entry. */ dep = (xfs_dir2_data_entry_t *)dup; /* * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* * Mark space for the data entry used. */ error = xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), (xfs_dir2_data_aoff_t)len, &needlog, &needscan); if (error) return error; /* * Create the new data entry. */ dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. */ if (needscan) xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; } /* * Log leaf entries from the block. */ static void xfs_dir2_block_log_leaf( xfs_trans_t *tp, /* transaction structure */ struct xfs_buf *bp, /* block buffer */ int first, /* index of first logged leaf */ int last) /* index of last logged leaf */ { xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); blp = xfs_dir2_block_leaf_p(btp); xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); } /* * Log the block tail. */ static void xfs_dir2_block_log_tail( xfs_trans_t *tp, /* transaction structure */ struct xfs_buf *bp) /* block buffer */ { xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } /* * Look up an entry in the block. This is the external routine, * xfs_dir2_block_lookup_int does the real work. */ int /* error */ xfs_dir2_block_lookup( xfs_da_args_t *args) /* dir lookup arguments */ { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ int ent; /* entry index */ int error; /* error return value */ trace_xfs_dir2_block_lookup(args); /* * Get the buffer, look up the entry. * If not found (ENOENT) then return, have no buffer. */ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) return error; dp = args->dp; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(blp[ent].address))); /* * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); return error; } /* * Internal block lookup routine. */ static int /* error */ xfs_dir2_block_lookup_int( xfs_da_args_t *args, /* dir lookup arguments */ struct xfs_buf **bpp, /* returned block buffer */ int *entno) /* returned entry number */ { xfs_dir2_dataptr_t addr; /* data entry address */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ int error; /* error return value */ xfs_dahash_t hash; /* found hash value */ int high; /* binary search high index */ int low; /* binary search low index */ int mid; /* binary search current idx */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ dp = args->dp; tp = args->trans; error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. * Find our entry, ENOENT if it's not there. */ for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) { ASSERT(low <= high); mid = (low + high) >> 1; if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) break; if (hash < args->hashval) low = mid + 1; else high = mid - 1; if (low > high) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); xfs_trans_brelse(tp, bp); return -ENOENT; } } /* * Back up to the first one with the right hash value. */ while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) { mid--; } /* * Now loop forward through all the entries with the * right hash value looking for our name. */ do { if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR) continue; /* * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr)); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store * the index and buffer and continue looking for an exact match. */ cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; *bpp = bp; *entno = mid; if (cmp == XFS_CMP_EXACT) return 0; } } while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash); ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); /* * Here, we can only be doing a lookup (not a rename or replace). * If a case-insensitive match was found earlier, return success. */ if (args->cmpresult == XFS_CMP_CASE) return 0; /* * No match, release the buffer and return ENOENT. */ xfs_trans_brelse(tp, bp); return -ENOENT; } /* * Remove an entry from a block format directory. * If that makes the block small enough to fit in shortform, transform it. */ int /* error */ xfs_dir2_block_removename( xfs_da_args_t *args) /* directory operation args */ { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ int ent; /* block leaf entry index */ int error; /* error return value */ int needlog; /* need to log block header */ int needscan; /* need to fixup bestfree */ xfs_dir2_sf_hdr_t sfh; /* shortform header */ int size; /* shortform size */ xfs_trans_t *tp; /* transaction pointer */ trace_xfs_dir2_block_removename(args); /* * Look up the entry in the block. Gets the buffer and entry index. * It will always be there, the vnodeops level does a lookup first. */ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { return error; } dp = args->dp; tp = args->trans; hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog, &needscan); /* * Fix up the block tail. */ be32_add_cpu(&btp->stale, 1); xfs_dir2_block_log_tail(tp, bp); /* * Remove the leaf entry by marking it stale. */ blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); xfs_dir2_block_log_leaf(tp, bp, ent, ent); /* * Fix up bestfree, log the header if necessary. */ if (needscan) xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, bp); xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); if (size > xfs_inode_data_fork_size(dp)) return 0; /* * If it works, do the conversion. */ return xfs_dir2_block_to_sf(args, bp, size, &sfh); } /* * Replace an entry in a V2 block directory. * Change the inode number to the new value. */ int /* error */ xfs_dir2_block_replace( xfs_da_args_t *args) /* directory operation args */ { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ int ent; /* leaf entry index */ int error; /* error return value */ trace_xfs_dir2_block_replace(args); /* * Lookup the entry in the directory. Get buffer and entry index. * This will always succeed since the caller has already done a lookup. */ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { return error; } dp = args->dp; hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; } /* * Qsort comparison routine for the block leaf entries. */ static int /* sort order */ xfs_dir2_block_sort( const void *a, /* first leaf entry */ const void *b) /* second leaf entry */ { const xfs_dir2_leaf_entry_t *la; /* first leaf entry */ const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */ la = a; lb = b; return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 : (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0); } /* * Convert a V2 leaf directory to a V2 block directory if possible. */ int /* error */ xfs_dir2_leaf_to_block( xfs_da_args_t *args, /* operation arguments */ struct xfs_buf *lbp, /* leaf buffer */ struct xfs_buf *dbp) /* data buffer */ { __be16 *bestsp; /* leaf bests table */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused data entry */ int error; /* error return value */ int from; /* leaf from index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ xfs_mount_t *mp; /* file system mount point */ int needlog; /* need to log data header */ int needscan; /* need to scan for bestfree */ xfs_dir2_sf_hdr_t sfh; /* shortform header */ int size; /* bytes used */ __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_to_block(args); dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have * been left behind during no-space-reservation operations. * These will show up in the leaf bests table. */ while (dp->i_disk_size > args->geo->blksize) { int hdrsz; hdrsz = args->geo->data_entry_offset; bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == args->geo->blksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) return error; } else return 0; } /* * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { error = xfs_dir3_data_read(tp, dp, args->owner, args->geo->datablk, 0, &dbp); if (error) return error; } hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); /* * Size of the "leaf" area in the block. */ size = (uint)sizeof(xfs_dir2_block_tail_t) + (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale); /* * Look at the last data entry. */ tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1; dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free or is too short we can't do it. */ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || be16_to_cpu(dup->length) < size) return 0; /* * Start converting it to block form. */ xfs_dir3_block_init(args, dbp); needlog = 1; needscan = 0; /* * Use up the space at the end of the block (blp/btp). */ error = xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, &needlog, &needscan); if (error) return error; /* * Initialize the block tail. */ btp = xfs_dir2_block_tail_p(args->geo, hdr); btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. */ lep = xfs_dir2_block_leaf_p(btp); for (from = to = 0; from < leafhdr.count; from++) { if (leafhdr.ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; lep[to++] = leafhdr.ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); /* * Scan the bestfree if we need it and log the data block header. */ if (needscan) xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); /* * Pitch the old leaf block. */ error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp); if (error) return error; /* * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); if (size > xfs_inode_data_fork_size(dp)) return 0; return xfs_dir2_block_to_sf(args, dbp, size, &sfh); } /* * Convert the shortform directory to block form. */ int /* error */ xfs_dir2_sf_to_block( struct xfs_da_args *args) { struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_da_geometry *geo = args->geo; xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ int dummy; /* trash */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ int endoffset; /* end of data objects */ int error; /* error return value */ int i; /* index */ int needlog; /* need to log block header */ int needscan; /* need to scan block freespc */ int newoffset; /* offset from current entry */ unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data; xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ struct xfs_name name; trace_xfs_dir2_sf_to_block(args); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(ifp->if_bytes == dp->i_disk_size); ASSERT(oldsfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); ASSERT(dp->i_df.if_nextents == 0); /* * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL); memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK); dp->i_disk_size = 0; /* * Add block 0 to the inode. */ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); if (error) goto out_free; /* * Initialize the data block, then convert it to block format. */ error = xfs_dir3_data_init(args, blkno, &bp); if (error) goto out_free; xfs_dir3_block_init(args, bp); hdr = bp->b_addr; /* * Compute size of block "tail" area. */ i = (uint)sizeof(*btp) + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); /* * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ dup = bp->b_addr + offset; needlog = needscan = 0; error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, i, &needlog, &needscan); if (error) goto out_free; ASSERT(needscan == 0); /* * Fill in the tail. */ btp = xfs_dir2_block_tail_p(args->geo, hdr); btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */ btp->stale = 0; blp = xfs_dir2_block_leaf_p(btp); endoffset = (uint)((char *)blp - (char *)hdr); /* * Remove the freespace, we'll manage it. */ error = xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), be16_to_cpu(dup->length), &needlog, &needscan); if (error) goto out_free; /* * Create entry for . */ dep = bp->b_addr + offset; dep->inumber = cpu_to_be64(args->owner); dep->namelen = 1; dep->name[0] = '.'; xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); tagp = xfs_dir2_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16(offset); xfs_dir2_data_log_entry(args, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset)); offset += xfs_dir2_data_entsize(mp, dep->namelen); /* * Create entry for .. */ dep = bp->b_addr + offset; dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); tagp = xfs_dir2_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16(offset); xfs_dir2_data_log_entry(args, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset)); offset += xfs_dir2_data_entsize(mp, dep->namelen); /* * Loop over existing entries, stuff them in. */ i = 0; if (!sfp->count) sfep = NULL; else sfep = xfs_dir2_sf_firstentry(sfp); /* * Need to preserve the existing offset values in the sf directory. * Insert holes (unused entries) where necessary. */ while (offset < endoffset) { /* * sfep is null when we reach the end of the list. */ if (sfep == NULL) newoffset = endoffset; else newoffset = xfs_dir2_sf_get_offset(sfep); /* * There should be a hole here, make one. */ if (offset < newoffset) { dup = bp->b_addr + offset; dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); dup->length = cpu_to_be16(newoffset - offset); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset); xfs_dir2_data_log_unused(args, bp, dup); xfs_dir2_data_freeinsert(hdr, xfs_dir2_data_bestfree_p(mp, hdr), dup, &dummy); offset += be16_to_cpu(dup->length); continue; } /* * Copy a real entry. */ dep = bp->b_addr + newoffset; dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep)); dep->namelen = sfep->namelen; xfs_dir2_data_put_ftype(mp, dep, xfs_dir2_sf_get_ftype(mp, sfep)); memcpy(dep->name, sfep->name, dep->namelen); tagp = xfs_dir2_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16(newoffset); xfs_dir2_data_log_entry(args, bp, dep); name.name = sfep->name; name.len = sfep->namelen; blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name)); blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset)); offset = (int)((char *)(tagp + 1) - (char *)hdr); if (++i == sfp->count) sfep = NULL; else sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* Done with the temporary buffer */ kfree(sfp); /* * Sort the leaf entries by hash value. */ xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort); /* * Log the leaf entry area and tail. * Already logged the header in data_init, ignore needlog. */ ASSERT(needscan == 0); xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); xfs_dir3_data_check(dp, bp); return 0; out_free: kfree(sfp); return error; }
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 // SPDX-License-Identifier: GPL-2.0-only /* * Ram backed block device driver. * * Copyright (C) 2007 Nick Piggin * Copyright (C) 2007 Novell Inc. * * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright * of their respective owners. */ #include <linux/init.h> #include <linux/initrd.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/major.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/xarray.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/debugfs.h> #include <linux/uaccess.h> /* * Each block ramdisk device has a xarray brd_pages of pages that stores * the pages containing the block device's contents. */ struct brd_device { int brd_number; struct gendisk *brd_disk; struct list_head brd_list; /* * Backing store of pages. This is the contents of the block device. */ struct xarray brd_pages; u64 brd_nr_pages; }; /* * Look up and return a brd's page with reference grabbed for a given sector. */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { struct page *page; XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); rcu_read_lock(); repeat: page = xas_load(&xas); if (xas_retry(&xas, page)) { xas_reset(&xas); goto repeat; } if (!page) goto out; if (!get_page_unless_zero(page)) { xas_reset(&xas); goto repeat; } if (unlikely(page != xas_reload(&xas))) { put_page(page); xas_reset(&xas); goto repeat; } out: rcu_read_unlock(); return page; } /* * Insert a new page for a given sector, if one does not already exist. * The returned page will grab reference. */ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector, blk_opf_t opf) { gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO; struct page *page, *ret; page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); if (!page) return ERR_PTR(-ENOMEM); xa_lock(&brd->brd_pages); ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL, page, gfp); if (!ret) { brd->brd_nr_pages++; get_page(page); xa_unlock(&brd->brd_pages); return page; } if (!xa_is_err(ret)) { get_page(ret); xa_unlock(&brd->brd_pages); put_page(page); return ret; } xa_unlock(&brd->brd_pages); put_page(page); return ERR_PTR(xa_err(ret)); } /* * Free all backing store pages and xarray. This must only be called when * there are no other users of the device. */ static void brd_free_pages(struct brd_device *brd) { struct page *page; pgoff_t idx; xa_for_each(&brd->brd_pages, idx, page) { put_page(page); cond_resched(); } xa_destroy(&brd->brd_pages); } /* * Process a single segment. The segment is capped to not cross page boundaries * in both the bio and the brd backing memory. */ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio) { struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); sector_t sector = bio->bi_iter.bi_sector; u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT; blk_opf_t opf = bio->bi_opf; struct page *page; void *kaddr; bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset); page = brd_lookup_page(brd, sector); if (!page && op_is_write(opf)) { page = brd_insert_page(brd, sector, opf); if (IS_ERR(page)) goto out_error; } kaddr = bvec_kmap_local(&bv); if (op_is_write(opf)) { memcpy_to_page(page, offset, kaddr, bv.bv_len); } else { if (page) memcpy_from_page(kaddr, page, offset, bv.bv_len); else memset(kaddr, 0, bv.bv_len); } kunmap_local(kaddr); bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len); if (page) put_page(page); return true; out_error: if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT)) bio_wouldblock_error(bio); else bio_io_error(bio); return false; } static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) { sector_t aligned_sector = round_up(sector, PAGE_SECTORS); sector_t aligned_end = round_down( sector + (size >> SECTOR_SHIFT), PAGE_SECTORS); struct page *page; if (aligned_end <= aligned_sector) return; xa_lock(&brd->brd_pages); while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) { page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); if (page) { put_page(page); brd->brd_nr_pages--; } aligned_sector += PAGE_SECTORS; } xa_unlock(&brd->brd_pages); } static void brd_submit_bio(struct bio *bio) { struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; if (unlikely(op_is_discard(bio->bi_opf))) { brd_do_discard(brd, bio->bi_iter.bi_sector, bio->bi_iter.bi_size); bio_endio(bio); return; } do { if (!brd_rw_bvec(brd, bio)) return; } while (bio->bi_iter.bi_size); bio_endio(bio); } static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .submit_bio = brd_submit_bio, }; /* * And now the modules code and kernel interface. */ static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; module_param(rd_nr, int, 0444); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE; module_param(rd_size, ulong, 0444); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); static int max_part = 1; module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); MODULE_DESCRIPTION("Ram backed block device driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); #ifndef MODULE /* Legacy boot options - nonmodular */ static int __init ramdisk_size(char *str) { rd_size = simple_strtol(str, NULL, 0); return 1; } __setup("ramdisk_size=", ramdisk_size); #endif /* * The device scheme is derived from loop.c. Keep them in synch where possible * (should share code eventually). */ static LIST_HEAD(brd_devices); static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; static struct brd_device *brd_find_or_alloc_device(int i) { struct brd_device *brd; mutex_lock(&brd_devices_mutex); list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) { mutex_unlock(&brd_devices_mutex); return ERR_PTR(-EEXIST); } } brd = kzalloc(sizeof(*brd), GFP_KERNEL); if (!brd) { mutex_unlock(&brd_devices_mutex); return ERR_PTR(-ENOMEM); } brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); mutex_unlock(&brd_devices_mutex); return brd; } static void brd_free_device(struct brd_device *brd) { mutex_lock(&brd_devices_mutex); list_del(&brd->brd_list); mutex_unlock(&brd_devices_mutex); kfree(brd); } static int brd_alloc(int i) { struct brd_device *brd; struct gendisk *disk; char buf[DISK_NAME_LEN]; int err = -ENOMEM; struct queue_limits lim = { /* * This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN * (This is only a problem on very small devices <= 4M, * otherwise fdisk will align on 1M. Regardless this call * is harmless) */ .physical_block_size = PAGE_SIZE, .max_hw_discard_sectors = UINT_MAX, .max_discard_segments = 1, .discard_granularity = PAGE_SIZE, .features = BLK_FEAT_SYNCHRONOUS | BLK_FEAT_NOWAIT, }; brd = brd_find_or_alloc_device(i); if (IS_ERR(brd)) return PTR_ERR(brd); xa_init(&brd->brd_pages); snprintf(buf, DISK_NAME_LEN, "ram%d", i); if (!IS_ERR_OR_NULL(brd_debugfs_dir)) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_free_dev; } disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); err = add_disk(disk); if (err) goto out_cleanup_disk; return 0; out_cleanup_disk: put_disk(disk); out_free_dev: brd_free_device(brd); return err; } static void brd_probe(dev_t dev) { brd_alloc(MINOR(dev) / max_part); } static void brd_cleanup(void) { struct brd_device *brd, *next; debugfs_remove_recursive(brd_debugfs_dir); list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { del_gendisk(brd->brd_disk); put_disk(brd->brd_disk); brd_free_pages(brd); brd_free_device(brd); } } static inline void brd_check_and_reset_par(void) { if (unlikely(!max_part)) max_part = 1; /* * make sure 'max_part' can be divided exactly by (1U << MINORBITS), * otherwise, it is possiable to get same dev_t when adding partitions. */ if ((1U << MINORBITS) % max_part != 0) max_part = 1UL << fls(max_part); if (max_part > DISK_MAX_PARTS) { pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n", DISK_MAX_PARTS, DISK_MAX_PARTS); max_part = DISK_MAX_PARTS; } } static int __init brd_init(void) { int err, i; /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. * * (1) if rd_nr is specified, create that many upfront. else * it defaults to CONFIG_BLK_DEV_RAM_COUNT * (2) User can further extend brd devices by create dev node themselves * and have kernel automatically instantiate actual device * on-demand. Example: * mknod /path/devnod_name b 1 X # 1 is the rd major * fdisk -l /path/devnod_name * If (X / max_part) was not already created it will be created * dynamically. */ brd_check_and_reset_par(); brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { err = -EIO; goto out_free; } for (i = 0; i < rd_nr; i++) brd_alloc(i); pr_info("brd: module loaded\n"); return 0; out_free: brd_cleanup(); pr_info("brd: module NOT loaded !!!\n"); return err; } static void __exit brd_exit(void) { unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); brd_cleanup(); pr_info("brd: module unloaded\n"); } module_init(brd_init); module_exit(brd_exit);
301 301 301 301 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ns_common.h> #include <linux/proc_ns.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); break; #endif #ifdef CONFIG_IPC_NS case CLONE_NEWIPC: VFS_WARN_ON_ONCE(ops != &ipcns_operations); break; #endif case CLONE_NEWNS: VFS_WARN_ON_ONCE(ops != &mntns_operations); break; #ifdef CONFIG_NET_NS case CLONE_NEWNET: VFS_WARN_ON_ONCE(ops != &netns_operations); break; #endif #ifdef CONFIG_PID_NS case CLONE_NEWPID: VFS_WARN_ON_ONCE(ops != &pidns_operations); break; #endif #ifdef CONFIG_TIME_NS case CLONE_NEWTIME: VFS_WARN_ON_ONCE(ops != &timens_operations); break; #endif #ifdef CONFIG_USER_NS case CLONE_NEWUSER: VFS_WARN_ON_ONCE(ops != &userns_operations); break; #endif #ifdef CONFIG_UTS_NS case CLONE_NEWUTS: VFS_WARN_ON_ONCE(ops != &utsns_operations); break; #endif } } #endif int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif if (inum) { ns->inum = inum; return 0; } return proc_alloc_inum(&ns->inum); } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); }
4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 // SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which emulate a local clock-event * device via a broadcast event source. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include <linux/smp.h> #include <linux/module.h> #include "tick-internal.h" /* * Broadcast support for broken x86 hardware, where the local apic * timer stops in C3 state. */ static struct tick_device tick_broadcast_device; static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly; static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly; static cpumask_var_t tmpmask __cpumask_var_read_mostly; static int tick_broadcast_forced; static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); # ifdef CONFIG_HOTPLUG_CPU static void tick_broadcast_oneshot_offline(unsigned int cpu); # endif #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } # ifdef CONFIG_HOTPLUG_CPU static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } # endif #endif /* * Debugging: see timer_list.c */ struct tick_device *tick_get_broadcast_device(void) { return &tick_broadcast_device; } struct cpumask *tick_get_broadcast_mask(void) { return tick_broadcast_mask; } static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu); const struct clock_event_device *tick_get_wakeup_device(int cpu) { return tick_get_oneshot_wakeup_device(cpu); } /* * Start the device in periodic mode */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { if (bc) tick_setup_periodic(bc, 1); } /* * Check, if the device can be utilized as broadcast device: */ static bool tick_check_broadcast_device(struct clock_event_device *curdev, struct clock_event_device *newdev) { if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || (newdev->features & CLOCK_EVT_FEAT_PERCPU) || (newdev->features & CLOCK_EVT_FEAT_C3STOP)) return false; if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) return false; return !curdev || newdev->rating > curdev->rating; } #ifdef CONFIG_TICK_ONESHOT static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) { return per_cpu(tick_oneshot_wakeup_device, cpu); } static void tick_oneshot_wakeup_handler(struct clock_event_device *wd) { /* * If we woke up early and the tick was reprogrammed in the * meantime then this may be spurious but harmless. */ tick_receive_broadcast(); } static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, int cpu) { struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu); if (!newdev) goto set_device; if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || (newdev->features & CLOCK_EVT_FEAT_C3STOP)) return false; if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) || !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) return false; if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) return false; if (curdev && newdev->rating <= curdev->rating) return false; if (!try_module_get(newdev->owner)) return false; newdev->event_handler = tick_oneshot_wakeup_handler; set_device: clockevents_exchange_device(curdev, newdev); per_cpu(tick_oneshot_wakeup_device, cpu) = newdev; return true; } #else static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) { return NULL; } static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, int cpu) { return false; } #endif /* * Conditionally install/replace broadcast device */ void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) { struct clock_event_device *cur = tick_broadcast_device.evtdev; if (tick_set_oneshot_wakeup_device(dev, cpu)) return; if (!tick_check_broadcast_device(cur, dev)) return; if (!try_module_get(dev->owner)) return; clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * If the system already runs in oneshot mode, switch the newly * registered broadcast device to oneshot mode explicitly. */ if (tick_broadcast_oneshot_active()) { tick_broadcast_switch_to_oneshot(); return; } /* * Inform all cpus about this. We might be in a situation * where we did not switch to oneshot mode because the per cpu * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack * of a oneshot capable broadcast device. Without that * notification the systems stays stuck in periodic mode * forever. */ tick_clock_notify(); } /* * Check, if the device is the broadcast device */ int tick_is_broadcast_device(struct clock_event_device *dev) { return (dev && tick_broadcast_device.evtdev == dev); } int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { int ret = -ENODEV; if (tick_is_broadcast_device(dev)) { raw_spin_lock(&tick_broadcast_lock); ret = __clockevents_update_freq(dev, freq); raw_spin_unlock(&tick_broadcast_lock); } return ret; } static void err_broadcast(const struct cpumask *mask) { pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); } static void tick_device_setup_broadcast_func(struct clock_event_device *dev) { if (!dev->broadcast) dev->broadcast = tick_broadcast; if (!dev->broadcast) { pr_warn_once("%s depends on broadcast, but no broadcast function available\n", dev->name); dev->broadcast = err_broadcast; } } /* * Check, if the device is dysfunctional and a placeholder, which * needs to be handled by the broadcast device. */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; int ret = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Devices might be registered with both periodic and oneshot * mode disabled. This signals, that the device needs to be * operated from the broadcast device and is a placeholder for * the cpu local device. */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_broadcast_mask); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else tick_broadcast_setup_oneshot(bc, false); ret = 1; } else { /* * Clear the broadcast bit for this cpu if the * device is not power state affected. */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) cpumask_clear_cpu(cpu, tick_broadcast_mask); else tick_device_setup_broadcast_func(dev); /* * Clear the broadcast bit if the CPU is not in * periodic broadcast on state. */ if (!cpumask_test_cpu(cpu, tick_broadcast_on)) cpumask_clear_cpu(cpu, tick_broadcast_mask); switch (tick_broadcast_device.mode) { case TICKDEV_MODE_ONESHOT: /* * If the system is in oneshot mode we can * unconditionally clear the oneshot mask bit, * because the CPU is running and therefore * not in an idle state which causes the power * state affected device to stop. Let the * caller initialize the device. */ tick_broadcast_clear_oneshot(cpu); ret = 0; break; case TICKDEV_MODE_PERIODIC: /* * If the system is in periodic mode, check * whether the broadcast device can be * switched off now. */ if (cpumask_empty(tick_broadcast_mask) && bc) clockevents_shutdown(bc); /* * If we kept the cpu in the broadcast mask, * tell the caller to leave the per cpu device * in shutdown state. The periodic interrupt * is delivered by the broadcast device, if * the broadcast device exists and is not * hrtimer based. */ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) ret = cpumask_test_cpu(cpu, tick_broadcast_mask); break; default: break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } int tick_receive_broadcast(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *evt = td->evtdev; if (!evt) return -ENODEV; if (!evt->event_handler) return -EINVAL; evt->event_handler(evt); return 0; } /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ static bool tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; bool local = false; /* * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { struct clock_event_device *bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, mask); /* * We only run the local handler, if the broadcast * device is not hrtimer based. Otherwise we run into * a hrtimer recursion. * * local timer_interrupt() * local_handler() * expire_hrtimers() * bc_handler() * local_handler() * expire_hrtimers() */ local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); } if (!cpumask_empty(mask)) { /* * It might be necessary to actually check whether the devices * have different broadcast functions. For now, just use the * one of the first device. This works as long as we have this * misfeature only on x86 (lapic) */ td = &per_cpu(tick_cpu_device, cpumask_first(mask)); td->evtdev->broadcast(mask); } return local; } /* * Periodic broadcast: * - invoke the broadcast handlers */ static bool tick_do_periodic_broadcast(void) { cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); return tick_do_broadcast(tmpmask); } /* * Event handler for periodic broadcast ticks */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool bc_local; raw_spin_lock(&tick_broadcast_lock); /* Handle spurious interrupts gracefully */ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { raw_spin_unlock(&tick_broadcast_lock); return; } bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC); clockevents_program_event(dev, next, true); } raw_spin_unlock(&tick_broadcast_lock); /* * We run the handler of the local cpu after dropping * tick_broadcast_lock because the handler might deadlock when * trying to switch to oneshot mode. */ if (bc_local) td->evtdev->event_handler(td->evtdev); } /** * tick_broadcast_control - Enable/disable or force broadcast mode * @mode: The selected broadcast mode * * Called when the system enters a state where affected tick devices * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. */ void tick_broadcast_control(enum tick_broadcast_mode mode) { struct clock_event_device *bc, *dev; struct tick_device *td; int cpu, bc_stopped; unsigned long flags; /* Protects also the local clockevent device. */ raw_spin_lock_irqsave(&tick_broadcast_lock, flags); td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; /* * Is the device not affected by the powerstate ? */ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) goto out; if (!tick_device_is_functional(dev)) goto out; cpu = smp_processor_id(); bc = tick_broadcast_device.evtdev; bc_stopped = cpumask_empty(tick_broadcast_mask); switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; fallthrough; case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { /* * Only shutdown the cpu local device, if: * * - the broadcast device exists * - the broadcast device is not a hrtimer based one * - the broadcast device is in periodic mode to * avoid a hiccup during switch to oneshot mode */ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } break; case TICK_BROADCAST_OFF: if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; } if (bc) { if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else tick_broadcast_setup_oneshot(bc, false); } } out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } EXPORT_SYMBOL_GPL(tick_broadcast_control); /* * Set the periodic handler depending on broadcast on/off */ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) { if (!broadcast) dev->event_handler = tick_handle_periodic; else dev->event_handler = tick_handle_periodic_broadcast; } #ifdef CONFIG_HOTPLUG_CPU static void tick_shutdown_broadcast(void) { struct clock_event_device *bc = tick_broadcast_device.evtdev; if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } } /* * Remove a CPU from broadcasting */ void tick_broadcast_offline(unsigned int cpu) { raw_spin_lock(&tick_broadcast_lock); cpumask_clear_cpu(cpu, tick_broadcast_mask); cpumask_clear_cpu(cpu, tick_broadcast_on); tick_broadcast_oneshot_offline(cpu); tick_shutdown_broadcast(); raw_spin_unlock(&tick_broadcast_lock); } #endif void tick_suspend_broadcast(void) { struct clock_event_device *bc; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) clockevents_shutdown(bc); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* * This is called from tick_resume_local() on a resuming CPU. That's * called from the core resume function, tick_unfreeze() and the magic XEN * resume hackery. * * In none of these cases the broadcast device mode can change and the * bit of the resuming CPU in the broadcast mask is safe as well. */ bool tick_resume_check_broadcast(void) { if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) return false; else return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); } void tick_resume_broadcast(void) { struct clock_event_device *bc; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) { clockevents_tick_resume(bc); switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); break; case TICKDEV_MODE_ONESHOT: if (!cpumask_empty(tick_broadcast_mask)) tick_resume_broadcast_oneshot(bc); break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #ifdef CONFIG_TICK_ONESHOT static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly; static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly; static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { return tick_broadcast_oneshot_mask; } /* * Called before going idle with interrupts disabled. Checks whether a * broadcast event from the other core is about to happen. We detected * that in tick_broadcast_oneshot_control(). The callsite can use this * to avoid a deep idle transition as we are about to get the * broadcast IPI right away. */ noinstr int tick_check_broadcast_expired(void) { #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); #else return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); #endif } /* * Set broadcast interrupt affinity */ static void tick_broadcast_set_affinity(struct clock_event_device *bc, const struct cpumask *cpumask) { if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) return; if (cpumask_equal(bc->cpumask, cpumask)) return; bc->cpumask = cpumask; irq_set_affinity(bc->irq, bc->cpumask); } static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, ktime_t expires) { if (!clockevent_state_oneshot(bc)) clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(bc, expires, 1); tick_broadcast_set_affinity(bc, cpumask_of(cpu)); } static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* * Called from irq_enter() when idle was interrupted to reenable the * per cpu device. */ void tick_check_oneshot_broadcast_this_cpu(void) { if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); /* * We might be in the middle of switching over from * periodic to oneshot. If the CPU has not yet * switched over, leave the device alone. */ if (td->mode == TICKDEV_MODE_ONESHOT) { clockevents_switch_state(td->evtdev, CLOCK_EVT_STATE_ONESHOT); } } } /* * Handle oneshot mode broadcasting */ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; int cpu, next_cpu = 0; bool bc_local; raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; next_event = KTIME_MAX; cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { /* * Required for !SMP because for_each_cpu() reports * unconditionally CPU0 as set on UP kernels. */ if (!IS_ENABLED(CONFIG_SMP) && cpumask_empty(tick_broadcast_oneshot_mask)) break; td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event <= now) { cpumask_set_cpu(cpu, tmpmask); /* * Mark the remote cpu in the pending mask, so * it can avoid reprogramming the cpu local * timer in tick_broadcast_oneshot_control(). */ cpumask_set_cpu(cpu, tick_broadcast_pending_mask); } else if (td->evtdev->next_event < next_event) { next_event = td->evtdev->next_event; next_cpu = cpu; } } /* * Remove the current cpu from the pending mask. The event is * delivered immediately in tick_do_broadcast() ! */ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); /* Take care of enforced broadcast requests */ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); /* * Sanity check. Catch the case where we try to broadcast to * offline cpus. */ if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) cpumask_and(tmpmask, tmpmask, cpu_online_mask); /* * Wakeup the cpus which have an expired event. */ bc_local = tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: * * - The global event did not expire any CPU local * events. This happens in dyntick mode, as the maximum PIT * delta is quite small. * * - There are pending events on sleeping CPUs which were not * in the event mask */ if (next_event != KTIME_MAX) tick_broadcast_set_event(dev, next_cpu, next_event); raw_spin_unlock(&tick_broadcast_lock); if (bc_local) { td = this_cpu_ptr(&tick_cpu_device); td->evtdev->event_handler(td->evtdev); } } static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) { if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) return 0; if (bc->next_event == KTIME_MAX) return 0; return bc->bound_on == cpu ? -EBUSY : 0; } static void broadcast_shutdown_local(struct clock_event_device *bc, struct clock_event_device *dev) { /* * For hrtimer based broadcasting we cannot shutdown the cpu * local device if our own event is the first one to expire or * if we own the broadcast timer. */ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { if (broadcast_needs_cpu(bc, smp_processor_id())) return; if (dev->next_event < bc->next_event) return; } clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state, struct tick_device *td, int cpu) { struct clock_event_device *bc, *dev = td->evtdev; int ret = 0; ktime_t now; raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; if (state == TICK_BROADCAST_ENTER) { /* * If the current CPU owns the hrtimer broadcast * mechanism, it cannot go deep idle and we do not add * the CPU to the broadcast mask. We don't have to go * through the EXIT path as the local timer is not * shutdown. */ ret = broadcast_needs_cpu(bc, cpu); if (ret) goto out; /* * If the broadcast device is in periodic mode, we * return. */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { /* If it is a hrtimer based broadcast, return busy */ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) ret = -EBUSY; goto out; } if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); /* Conditionally shut down the local timer. */ broadcast_shutdown_local(bc, dev); /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and * if the cpu local event is earlier than the * broadcast event. If the current CPU is in * the force mask, then we are going to be * woken by the IPI right away; we return * busy, so the CPU does not try to go deep * idle. */ if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { ret = -EBUSY; } else if (dev->next_event < bc->next_event) { tick_broadcast_set_event(bc, cpu, dev->next_event); /* * In case of hrtimer broadcasts the * programming might have moved the * timer to this cpu. If yes, remove * us from the broadcast mask and * return busy. */ ret = broadcast_needs_cpu(bc, cpu); if (ret) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } } } } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast * pending mask and fired the broadcast * IPI. So we are going to handle the expired * event anyway via the broadcast IPI * handler. No need to reprogram the timer * with an already expired event. */ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_pending_mask)) goto out; /* * Bail out if there is no next event. */ if (dev->next_event == KTIME_MAX) goto out; /* * If the pending bit is not set, then we are * either the CPU handling the broadcast * interrupt or we got woken by something else. * * We are no longer in the broadcast mask, so * if the cpu local expiry time is already * reached, we would reprogram the cpu local * timer with an already expired event. * * This can lead to a ping-pong when we return * to idle and therefore rearm the broadcast * timer before the cpu local timer was able * to fire. This happens because the forced * reprogramming makes sure that the event * will happen in the future and depending on * the min_delta setting this might be far * enough out that the ping-pong starts. * * If the cpu local next_event has expired * then we know that the broadcast timer * next_event has expired as well and * broadcast is about to be handled. So we * avoid reprogramming and enforce that the * broadcast handler, which did not run yet, * will invoke the cpu local handler. * * We cannot call the handler directly from * here, because we might be in a NOHZ phase * and we did not go through the irq_enter() * nohz fixups. */ now = ktime_get(); if (dev->next_event <= now) { cpumask_set_cpu(cpu, tick_broadcast_force_mask); goto out; } /* * We got woken by something else. Reprogram * the cpu local timer device. */ tick_program_event(dev->next_event, 1); } } out: raw_spin_unlock(&tick_broadcast_lock); return ret; } static int tick_oneshot_wakeup_control(enum tick_broadcast_state state, struct tick_device *td, int cpu) { struct clock_event_device *dev, *wd; dev = td->evtdev; if (td->mode != TICKDEV_MODE_ONESHOT) return -EINVAL; wd = tick_get_oneshot_wakeup_device(cpu); if (!wd) return -ENODEV; switch (state) { case TICK_BROADCAST_ENTER: clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(wd, dev->next_event, 1); break; case TICK_BROADCAST_EXIT: /* We may have transitioned to oneshot mode while idle */ if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT) return -ENODEV; } return 0; } int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); if (!tick_oneshot_wakeup_control(state, td, cpu)) return 0; if (tick_broadcast_device.evtdev) return ___tick_broadcast_oneshot_control(state, td, cpu); /* * If there is no broadcast or wakeup device, tell the caller not * to go into deep idle. */ return -EBUSY; } /* * Reset the one shot broadcast for a cpu * * Called with tick_broadcast_lock held */ static void tick_broadcast_clear_oneshot(int cpu) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, ktime_t expires) { struct tick_device *td; int cpu; for_each_cpu(cpu, mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev) td->evtdev->next_event = expires; } } static inline ktime_t tick_get_next_period(void) { ktime_t next; /* * Protect against concurrent updates (store /load tearing on * 32bit). It does not matter if the time is already in the * past. The broadcast device which is about to be programmed will * fire in any case. */ raw_spin_lock(&jiffies_lock); next = tick_next_period; raw_spin_unlock(&jiffies_lock); return next; } /** * tick_broadcast_setup_oneshot - setup the broadcast device * @bc: the broadcast device * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { int cpu = smp_processor_id(); ktime_t nexttick = 0; if (!bc) return; /* * When the broadcast device was switched to oneshot by the first * CPU handling the NOHZ change, the other CPUs will reach this * code via hrtimer_run_queues() -> tick_check_oneshot_change() * too. Set up the broadcast device only once! */ if (bc->event_handler == tick_handle_oneshot_broadcast) { /* * The CPU which switched from periodic to oneshot mode * set the broadcast oneshot bit for all other CPUs which * are in the general (periodic) broadcast mask to ensure * that CPUs which wait for the periodic broadcast are * woken up. * * Clear the bit for the local CPU as the set bit would * prevent the first tick_broadcast_enter() after this CPU * switched to oneshot state to program the broadcast * device. * * This code can also be reached via tick_broadcast_control(), * but this cannot avoid the tick_broadcast_clear_oneshot() * as that would break the periodic to oneshot transition of * secondary CPUs. But that's harmless as the below only * clears already cleared bits. */ tick_broadcast_clear_oneshot(cpu); return; } bc->event_handler = tick_handle_oneshot_broadcast; bc->next_event = KTIME_MAX; /* * When the tick mode is switched from periodic to oneshot it must * be ensured that CPUs which are waiting for periodic broadcast * get their wake-up at the next tick. This is achieved by ORing * tick_broadcast_mask into tick_broadcast_oneshot_mask. * * For other callers, e.g. broadcast device replacement, * tick_broadcast_oneshot_mask must not be touched as this would * set bits for CPUs which are already NOHZ, but not idle. Their * next tick_broadcast_enter() would observe the bit set and fail * to update the expiry time and the broadcast event device. */ if (from_periodic) { cpumask_copy(tmpmask, tick_broadcast_mask); /* Remove the local CPU as it is obviously not idle */ cpumask_clear_cpu(cpu, tmpmask); cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask); /* * Ensure that the oneshot broadcast handler will wake the * CPUs which are still waiting for periodic broadcast. */ nexttick = tick_get_next_period(); tick_broadcast_init_next_event(tmpmask, nexttick); /* * If the underlying broadcast clock event device is * already in oneshot state, then there is nothing to do. * The device was already armed for the next tick * in tick_handle_broadcast_periodic() */ if (clockevent_state_oneshot(bc)) return; } /* * When switching from periodic to oneshot mode arm the broadcast * device for the next tick. * * If the broadcast device has been replaced in oneshot mode and * the oneshot broadcast mask is not empty, then arm it to expire * immediately in order to reevaluate the next expiring timer. * @nexttick is 0 and therefore in the past which will cause the * clockevent code to force an event. * * For both cases the programming can be avoided when the oneshot * broadcast mask is empty. * * tick_broadcast_set_event() implicitly switches the broadcast * device to oneshot state. */ if (!cpumask_empty(tick_broadcast_oneshot_mask)) tick_broadcast_set_event(bc, cpu, nexttick); } /* * Select oneshot operating mode for the broadcast device */ void tick_broadcast_switch_to_oneshot(void) { struct clock_event_device *bc; enum tick_device_mode oldmode; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); oldmode = tick_broadcast_device.mode; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { struct clock_event_device *bc; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { /* * If the broadcast force bit of the current CPU is set, * then the current CPU has not yet reprogrammed the local * timer device to avoid a ping-pong race. See * ___tick_broadcast_oneshot_control(). * * If the broadcast device is hrtimer based then * programming the broadcast event below does not have any * effect because the local clockevent device is not * running and not programmed because the broadcast event * is not earlier than the pending event of the local clock * event device. As a consequence all CPUs waiting for a * broadcast event are stuck forever. * * Detect this condition and reprogram the cpu local timer * device to avoid the starvation. */ if (tick_check_broadcast_expired()) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); tick_program_event(td->evtdev->next_event, 1); } /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* * Remove a dying CPU from broadcasting */ static void tick_broadcast_oneshot_offline(unsigned int cpu) { if (tick_get_oneshot_wakeup_device(cpu)) tick_set_oneshot_wakeup_device(NULL, cpu); /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! */ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); } #endif /* * Check, whether the broadcast device is in one shot mode */ int tick_broadcast_oneshot_active(void) { return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; } /* * Check whether the broadcast device supports oneshot. */ bool tick_broadcast_oneshot_available(void) { struct clock_event_device *bc = tick_broadcast_device.evtdev; return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } #else int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc = tick_broadcast_device.evtdev; if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) return -EBUSY; return 0; } #endif void __init tick_broadcast_init(void) { zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif }
600 588 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #ifndef _LRU_LIST_H #define _LRU_LIST_H #include <linux/list.h> #include <linux/nodemask.h> #include <linux/shrinker.h> #include <linux/xarray.h> struct mem_cgroup; /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ LRU_REMOVED_RETRY, /* item removed, but lock has been dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ LRU_STOP, /* stop lru list walking. May drop the lock internally, but has to return locked. */ }; struct list_lru_one { struct list_head list; /* may become negative during memcg reparenting */ long nr_items; /* protects all fields above */ spinlock_t lock; }; struct list_lru_memcg { struct rcu_head rcu; /* array of per cgroup per node lists, indexed by node id */ struct list_lru_one node[]; }; struct list_lru_node { /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; atomic_long_t nr_items; } ____cacheline_aligned_in_smp; struct list_lru { struct list_lru_node *node; #ifdef CONFIG_MEMCG struct list_head list; int shrinker_id; bool memcg_aware; struct xarray xa; #endif #ifdef CONFIG_LOCKDEP struct lock_class_key *key; #endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker); #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, shrinker) static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker *shrinker, struct lock_class_key *key) { #ifdef CONFIG_LOCKDEP lru->key = key; #endif return list_lru_init_memcg(lru, shrinker); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); /** * list_lru_add: add an element to the lru list's tail * @lru: the lru pointer * @item: the item to be added. * @nid: the node id of the sublist to add the item to. * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing * nothing. This means that it is not necessary to keep state about whether or * not the element already belongs in the list. That said, this logic only * works if the item is in *this* list. If the item might be in some other * list, then you cannot rely on this check and you must remove it from the * other list before trying to insert it. * * The lru list consists of many sublists internally; the @nid and @memcg * parameters are used to determine which sublist to insert the item into. * It's important to use the right value of @nid and @memcg when deleting the * item, since it might otherwise get deleted from the wrong sublist. * * This also applies when attempting to insert the item multiple times - if * the item is currently in one sublist and you call list_lru_add() again, you * must pass the right @nid and @memcg parameters so that the same sublist is * used. * * You must ensure that the memcg is not freed during this call (e.g., with * rcu or by taking a css refcnt). * * Return: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg); /** * list_lru_add_obj: add an element to the lru list's tail * @lru: the lru pointer * @item: the item to be added. * * This function is similar to list_lru_add(), but the NUMA node and the * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * * Return: true if the list was updated, false otherwise */ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element from the lru list * @lru: the lru pointer * @item: the item to be deleted. * @nid: the node id of the sublist to delete the item from. * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list * manipulation. * * The comments in list_lru_add() about an element already being in a list are * also valid for list_lru_del(), that is, you can delete an item that has * already been removed or never been added. However, if the item is in a * list, it must be in *this* list, and you must pass the right value of @nid * and @memcg so that the right sublist is used. * * You must ensure that the memcg is not freed during this call (e.g., with * rcu or by taking a css refcnt). When a memcg is deleted, list_lru entries * are automatically moved to the parent memcg. This is done in a race-free * way, so during deletion of an memcg both the old and new memcg will resolve * to the same sublist internally. * * Return: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg); /** * list_lru_del_obj: delete an element from the lru list * @lru: the lru pointer * @item: the item to be deleted. * * This function is similar to list_lru_del(), but the NUMA node and the * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * * Return: true if the list was updated, false otherwise. */ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. * @nid: the node id to count from. * @memcg: the cgroup to count from. * * There is no guarantee that the list is not updated while the count is being * computed. Callers that want such a guarantee need to provide an outer lock. * * Return: 0 for empty lists, otherwise the number of objects * currently held by @lru. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); unsigned long list_lru_count_node(struct list_lru *lru, int nid); static inline unsigned long list_lru_shrink_count(struct list_lru *lru, struct shrink_control *sc) { return list_lru_count_one(lru, sc->nid, sc->memcg); } static inline unsigned long list_lru_count(struct list_lru *lru) { long count = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) count += list_lru_count_node(lru, nid); return count; } void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, void *cb_arg); /** * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is responsible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * This function will scan all elements in a particular @lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback * will return an enum lru_status telling the @lru infrastructure what to * do with the object being scanned. * * Please note that @nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * * Return: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is responsible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * Same as list_lru_walk_one() except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); static inline unsigned long list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) { long isolated = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) { isolated += list_lru_walk_node(lru, nid, isolate, cb_arg, &nr_to_walk); if (nr_to_walk <= 0) break; } return isolated; } #endif /* _LRU_LIST_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/netfilter_bridge.h> #include <uapi/linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> #include <linux/rculist.h> #include <linux/inetdevice.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/dst_metadata.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> #include <net/netns/generic.h> #include <net/inet_dscp.h> #include <linux/uaccess.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #endif static unsigned int brnf_net_id __read_mostly; struct brnf_net { bool enabled; #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_hdr; #endif /* default value is 1 */ int call_iptables; int call_ip6tables; int call_arptables; /* default value is 0 */ int filter_vlan_tagged; int filter_pppoe_tagged; int pass_vlan_indev; }; #define IS_IP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) #define IS_IPV6(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) #define IS_ARP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) static inline __be16 vlan_proto(const struct sk_buff *skb) { if (skb_vlan_tag_present(skb)) return skb->protocol; else if (skb->protocol == htons(ETH_P_8021Q)) return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; else return 0; } static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged; } static inline bool is_vlan_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_IPV6) && brnet->filter_vlan_tagged; } static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged; } static inline __be16 pppoe_proto(const struct sk_buff *skb) { return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + sizeof(struct pppoe_hdr))); } static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged; } static inline bool is_pppoe_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IPV6) && brnet->filter_pppoe_tagged; } /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) struct brnf_frag_data { local_lock_t bh_lock; char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; u8 encap_size; u8 size; u16 vlan_tci; __be16 vlan_proto; }; static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static void nf_bridge_info_free(struct sk_buff *skb) { skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) { struct net_bridge_port *port; port = br_port_get_rcu(dev); return port ? port->br->dev : NULL; } static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { return skb_ext_add(skb, SKB_EXT_BRIDGE_NF); } unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { case __cpu_to_be16(ETH_P_8021Q): return VLAN_HLEN; case __cpu_to_be16(ETH_P_PPP_SES): return PPPOE_SES_HLEN; default: return 0; } } static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) { unsigned int len = nf_bridge_encap_header_len(skb); skb_pull(skb, len); skb->network_header += len; } static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) { unsigned int len = nf_bridge_encap_header_len(skb); skb_pull_rcsum(skb, len); skb->network_header += len; } /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format */ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) { const struct iphdr *iph; u32 len; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* Basic sanity checks */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = skb_ip_totlen(skb); if (skb->len < len) { __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); /* We should really parse IP options here but until * somebody who actually uses IP options complains to * us we'll just silently ignore the options because * we're lazy! */ return 0; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: return -1; } void nf_bridge_update_protocol(struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); switch (nf_bridge->orig_proto) { case BRNF_PROTO_8021Q: skb->protocol = htons(ETH_P_8021Q); break; case BRNF_PROTO_PPPOE: skb->protocol = htons(ETH_P_PPP_SES); break; case BRNF_PROTO_UNCHANGED: break; } } /* Obtain the correct destination MAC address, while preserving the original * source MAC address. If we already know this address, we just copy it. If we * don't, we use the neighbour framework to find out. In both cases, we make * sure that br_handle_frame_finish() is called afterwards. */ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb) { struct neighbour *neigh; struct dst_entry *dst; skb->dev = bridge_parent(skb->dev); if (!skb->dev) goto free_skb; dst = skb_dst(skb); neigh = dst_neigh_lookup_skb(dst, skb); if (neigh) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); int ret; if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) && READ_ONCE(neigh->hh.hh_len)) { struct net_device *br_indev; br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { neigh_release(neigh); goto free_skb; } neigh_hh_bridge(&neigh->hh, skb); skb->dev = br_indev; ret = br_handle_frame_finish(net, sk, skb); } else { /* the neighbour function below overwrites the complete * MAC header, so we save the Ethernet source address and * protocol number. */ skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ ret = READ_ONCE(neigh->output)(neigh, skb); } neigh_release(neigh); return ret; } free_skb: kfree_skb(skb); return 0; } static inline bool br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, const struct nf_bridge_info *nf_bridge) { return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge * port group as it was received on. We can still bridge * the packet. * 2. The packet was DNAT'ed to a different device, either * a non-bridged device or another bridge port group. * The packet will need to be routed. * * The correct way of distinguishing between these two cases is to * call ip_route_input() and to look at skb->dst->dev, which is * changed to the destination device if ip_route_input() succeeds. * * Let's first consider the case that ip_route_input() succeeds: * * If the output device equals the logical bridge device the packet * came in on, we can consider this bridging. The corresponding MAC * address will be obtained in br_nf_pre_routing_finish_bridge. * Otherwise, the packet is considered to be routed and we just * change the destination MAC address so that the packet will * later be passed up to the IP stack to be routed. For a redirected * packet, ip_route_input() will give back the localhost as output device, * which differs from the bridge device. * * Let's now consider the case that ip_route_input() fails: * * This can be because the destination address is martian, in which case * the packet will be dropped. * If IP forwarding is disabled, ip_route_input() will fail, while * ip_route_output_key() can return success. The source * address for ip_route_output_key() is set to zero, so ip_route_output_key() * thinks we're handling a locally generated packet and won't care * if IP forwarding is enabled. If the output device equals the logical bridge * device, we proceed as if ip_route_input() succeeded. If it differs from the * logical bridge port or if ip_route_output_key() fails we drop the packet. */ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *dev = skb->dev, *br_indev; const struct iphdr *iph = ip_hdr(skb); enum skb_drop_reason reason; struct rtable *rt; br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { kfree_skb(skb); return 0; } nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { reason = ip_route_input(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (reason) { kfree_skb_reason(skb, reason); return 0; } else { if (skb_dst(skb)->dev == dev) { skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } } else { rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; } skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish); return 0; } static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev, const struct net *net) { struct net_device *vlan, *br; struct brnf_net *brnet = net_generic(net, brnf_net_id); br = bridge_parent(dev); if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, skb_vlan_tag_get(skb) & VLAN_VID_MASK); return vlan ? vlan : br; } /* Some common code for IPv4/IPv6 */ struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } nf_bridge->in_prerouting = 1; nf_bridge->physinif = skb->dev->ifindex; skb->dev = brnf_get_logical_dev(skb, skb->dev, net); if (skb->protocol == htons(ETH_P_8021Q)) nf_bridge->orig_proto = BRNF_PROTO_8021Q; else if (skb->protocol == htons(ETH_P_PPP_SES)) nf_bridge->orig_proto = BRNF_PROTO_PPPOE; /* Must drop socket now because of tproxy. */ skb_orphan(skb); return skb->dev; } /* Direct IPv6 traffic to br_nf_pre_routing_ipv6. * Replicate the checks that IPv4 does on packet reception. * Set skb->dev to the bridge device (i.e. parent of the * receiving device) to make netfilter happy, the REDIRECT * target in particular. Save the original destination IP * address to be able to detect DNAT afterwards. */ static unsigned int br_nf_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); struct brnf_net *brnet; if (unlikely(!pskb_may_pull(skb, len))) return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0); p = br_port_get_rcu(state->in); if (p == NULL) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); br = p->br; brnet = net_generic(state->net, brnf_net_id); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) { if (!brnet->call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; if (!ipv6_mod_enabled()) { pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported."); return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0); } nf_bridge_pull_encap_header_rcsum(skb); return br_nf_pre_routing_ipv6(priv, skb, state); } if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) && !is_pppoe_ip(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); if (br_validate_ipv4(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); if (!nf_bridge_alloc(skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); if (!setup_pre_routing(skb, state->net)) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); nf_bridge = nf_bridge_info_get(skb); nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IP); skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4; NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish); return NF_STOLEN; } #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* conntracks' nf_confirm logic cannot handle cloned skbs referencing * the same nf_conn entry, which will happen for multicast (broadcast) * Frames on bridges. * * Example: * macvlan0 * br0 * ethX ethY * * ethX (or Y) receives multicast or broadcast packet containing * an IP packet, not yet in conntrack table. * * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting. * -> skb->_nfct now references a unconfirmed entry * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge * interface. * 3. skb gets passed up the stack. * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb * and schedules a work queue to send them out on the lower devices. * * The clone skb->_nfct is not a copy, it is the same entry as the * original skb. The macvlan rx handler then returns RX_HANDLER_PASS. * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb. * * The Macvlan broadcast worker and normal confirm path will race. * * This race will not happen if step 2 already confirmed a clone. In that * case later steps perform skb_clone() with skb->_nfct already confirmed (in * hash table). This works fine. * * But such confirmation won't happen when eb/ip/nftables rules dropped the * packets before they reached the nf_confirm step in postrouting. * * Work around this problem by explicit confirmation of the entry at * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed * entry. * */ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; if (promisc) { nf_reset_ct(skb); return NF_ACCEPT; } if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; ct = container_of(nfct, struct nf_conn, ct_general); if (likely(nf_ct_is_confirmed(ct))) return NF_ACCEPT; if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) { nf_reset_ct(skb); return NF_ACCEPT; } WARN_ON_ONCE(skb_shared(skb)); /* We can't call nf_confirm here, it would create a dependency * on nf_conntrack module. */ ct_hook = rcu_dereference(nf_ct_hook); if (!ct_hook) { skb->_nfct = 0ul; nf_conntrack_put(nfct); return NF_ACCEPT; } nf_bridge_pull_encap_header(skb); ret = ct_hook->confirm(skb); switch (ret & NF_VERDICT_MASK) { case NF_STOLEN: return NF_STOLEN; default: nf_bridge_push_encap_header(skb); break; } return ret; } #endif /* PF_BRIDGE/FORWARD *************************************************/ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *in; if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) { if (skb->protocol == htons(ETH_P_IP)) nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (skb->protocol == htons(ETH_P_IPV6)) nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; in = nf_bridge_get_physindev(skb, net); if (!in) { kfree_skb(skb); return 0; } if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge_update_protocol(skb); } else { in = *((struct net_device **)(skb->cb)); } nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev, br_forward_finish); return 0; } static unsigned int br_nf_forward_ip(struct sk_buff *skb, const struct nf_hook_state *state, u8 pf) { struct nf_bridge_info *nf_bridge; struct net_device *parent; nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return NF_ACCEPT; /* Need exclusive nf_bridge_info since we might have multiple * different physoutdevs. */ if (!nf_bridge_unshare(skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); parent = bridge_parent(state->out); if (!parent) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); nf_bridge_pull_encap_header(skb); if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } if (pf == NFPROTO_IPV4) { if (br_validate_ipv4(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; skb->protocol = htons(ETH_P_IP); } else if (pf == NFPROTO_IPV6) { if (br_validate_ipv6(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; skb->protocol = htons(ETH_P_IPV6); } else { WARN_ON_ONCE(1); return NF_DROP; } nf_bridge->physoutdev = skb->dev; NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb, brnf_get_logical_dev(skb, state->in, state->net), parent, br_nf_forward_finish); return NF_STOLEN; } static unsigned int br_nf_forward_arp(struct sk_buff *skb, const struct nf_hook_state *state) { struct net_bridge_port *p; struct net_bridge *br; struct net_device **d = (struct net_device **)(skb->cb); struct brnf_net *brnet; p = br_port_get_rcu(state->out); if (p == NULL) return NF_ACCEPT; br = p->br; brnet = net_generic(state->net, brnf_net_id); if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) return NF_ACCEPT; if (is_vlan_arp(skb, state->net)) nf_bridge_pull_encap_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr)))) return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0); if (arp_hdr(skb)->ar_pln != 4) { if (is_vlan_arp(skb, state->net)) nf_bridge_push_encap_header(skb); return NF_ACCEPT; } *d = state->in; NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb, state->in, state->out, br_nf_forward_finish); return NF_STOLEN; } /* This is the 'purely bridged' case. For IP, we pass the packet to * netfilter with indev and outdev set to the bridge device, * but we are still able to filter on the 'real' indev/outdev * because of the physdev module. For ARP, indev and outdev are the * bridge ports. */ static unsigned int br_nf_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV4); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV6); if (IS_ARP(skb) || is_vlan_arp(skb, state->net)) return br_nf_forward_arp(skb, state); return NF_ACCEPT; } static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct brnf_frag_data *data; int err; data = this_cpu_ptr(&brnf_frag_data_storage); err = skb_cow_head(skb, data->size); if (err) { kfree_skb(skb); return 0; } if (data->vlan_proto) __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } static int br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { unsigned int mtu = ip_skb_dst_mtu(sk, skb); struct iphdr *iph = ip_hdr(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE) return PPPOE_SES_HLEN; return 0; } static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; int ret; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } /* Fragmentation on metadata/template dst is not supported */ if (unlikely(!skb_valid_dst(skb))) goto drop; /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) && skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; if (br_validate_ipv4(net, skb)) goto drop; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { data->vlan_tci = skb->vlan_tci; data->vlan_proto = skb->vlan_proto; } else { data->vlan_proto = 0; } data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); return ret; } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); struct brnf_frag_data *data; if (br_validate_ipv6(net, skb)) goto drop; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); if (v6ops) { ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); return ret; } local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); kfree_skb(skb); return -EMSGSIZE; } nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); drop: kfree_skb(skb); return 0; } /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *realoutdev = bridge_parent(skb->dev); u_int8_t pf; /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in * on a bridge, but was delivered locally and is now being routed: * * POST_ROUTING was already invoked from the ip stack. */ if (!nf_bridge || !nf_bridge->physoutdev) return NF_ACCEPT; if (!realoutdev) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } nf_bridge_pull_encap_header(skb); if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb, NULL, realoutdev, br_nf_dev_queue_xmit); return NF_STOLEN; } /* IP/SABOTAGE *****************************************************/ /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING * for the second time. */ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge) { if (nf_bridge->sabotage_in_done) return NF_ACCEPT; if (!nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { nf_bridge->sabotage_in_done = 1; state->okfn(state->net, state->sk, skb); return NF_STOLEN; } } return NF_ACCEPT; } /* This is called when br_netfilter has called into iptables/netfilter, * and DNAT has taken place on a bridge-forwarded packet. * * neigh->output has created a new MAC header, with local br0 MAC * as saddr. * * This restores the original MAC saddr of the bridged packet * before invoking bridge forward logic to transmit the packet. */ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *br_indev; br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev)); if (!br_indev) { kfree_skb(skb); return; } skb_pull(skb, ETH_HLEN); nf_bridge->bridged_dnat = 0; BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN - ETH_ALEN); skb->dev = br_indev; nf_bridge->physoutdev = NULL; br_handle_frame_finish(dev_net(skb->dev), NULL, skb); } static int br_nf_dev_xmit(struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge && nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } return 0; } static const struct nf_br_ops br_ops = { .br_dev_xmit_hook = br_nf_dev_xmit, }; /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ static const struct nf_hook_ops br_nf_ops[] = { { .hook = br_nf_pre_routing, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_BRNF, }, #if IS_ENABLED(CONFIG_NF_CONNTRACK) { .hook = br_nf_local_in, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_LAST, }, #endif { .hook = br_nf_forward, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_BRNF, }, { .hook = br_nf_post_routing, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_LAST, }, { .hook = ip_sabotage_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_FIRST, }, { .hook = ip_sabotage_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_FIRST, }, }; static int brnf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct brnf_net *brnet; struct net *net; int ret; if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev)) return NOTIFY_DONE; ASSERT_RTNL(); net = dev_net(dev); brnet = net_generic(net, brnf_net_id); if (brnet->enabled) return NOTIFY_OK; ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret) return NOTIFY_BAD; brnet->enabled = true; return NOTIFY_OK; } static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; /* recursively invokes nf_hook_slow (again), skipping already-called * hooks (< NF_BR_PRI_BRNF). * * Called with rcu read lock held. */ int br_nf_hook_thresh(unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { const struct nf_hook_entries *e; struct nf_hook_state state; struct nf_hook_ops **ops; unsigned int i; int ret; e = rcu_dereference(net->nf.hooks_bridge[hook]); if (!e) return okfn(net, sk, skb); ops = nf_hook_entries_get_hook_ops(e); for (i = 0; i < e->num_hook_entries; i++) { /* These hooks have already been called */ if (ops[i]->priority < NF_BR_PRI_BRNF) continue; /* These hooks have not been called yet, run them. */ if (ops[i]->priority > NF_BR_PRI_BRNF) break; /* take a closer look at NF_BR_PRI_BRNF. */ if (ops[i]->hook == br_nf_pre_routing) { /* This hook diverted the skb to this function, * hooks after this have not been run yet. */ i++; break; } } nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, e, i); if (ret == 1) ret = okfn(net, sk, skb); return ret; } #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *(int *)(ctl->data)) *(int *)(ctl->data) = 1; return ret; } static struct ctl_table brnf_table[] = { { .procname = "bridge-nf-call-arptables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-iptables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-ip6tables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-vlan-tagged", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-pppoe-tagged", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-pass-vlan-input-dev", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, }; static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) { brnf->call_iptables = 1; brnf->call_ip6tables = 1; brnf->call_arptables = 1; brnf->filter_vlan_tagged = 0; brnf->filter_pppoe_tagged = 0; brnf->pass_vlan_indev = 0; } static int br_netfilter_sysctl_init_net(struct net *net) { struct ctl_table *table = brnf_table; struct brnf_net *brnet; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL); if (!table) return -ENOMEM; } brnet = net_generic(net, brnf_net_id); table[0].data = &brnet->call_arptables; table[1].data = &brnet->call_iptables; table[2].data = &brnet->call_ip6tables; table[3].data = &brnet->filter_vlan_tagged; table[4].data = &brnet->filter_pppoe_tagged; table[5].data = &brnet->pass_vlan_indev; br_netfilter_sysctl_default(brnet); brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table, ARRAY_SIZE(brnf_table)); if (!brnet->ctl_hdr) { if (!net_eq(net, &init_net)) kfree(table); return -ENOMEM; } return 0; } static void br_netfilter_sysctl_exit_net(struct net *net, struct brnf_net *brnet) { const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(brnet->ctl_hdr); if (!net_eq(net, &init_net)) kfree(table); } static int __net_init brnf_init_net(struct net *net) { return br_netfilter_sysctl_init_net(net); } #endif static void __net_exit brnf_exit_net(struct net *net) { struct brnf_net *brnet; brnet = net_generic(net, brnf_net_id); if (brnet->enabled) { nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); brnet->enabled = false; } #ifdef CONFIG_SYSCTL br_netfilter_sysctl_exit_net(net, brnet); #endif } static struct pernet_operations brnf_net_ops __read_mostly = { #ifdef CONFIG_SYSCTL .init = brnf_init_net, #endif .exit = brnf_exit_net, .id = &brnf_net_id, .size = sizeof(struct brnf_net), }; static int __init br_netfilter_init(void) { int ret; ret = register_pernet_subsys(&brnf_net_ops); if (ret < 0) return ret; ret = register_netdevice_notifier(&brnf_notifier); if (ret < 0) { unregister_pernet_subsys(&brnf_net_ops); return ret; } RCU_INIT_POINTER(nf_br_ops, &br_ops); printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; } static void __exit br_netfilter_fini(void) { RCU_INIT_POINTER(nf_br_ops, NULL); unregister_netdevice_notifier(&brnf_notifier); unregister_pernet_subsys(&brnf_net_ops); } module_init(br_netfilter_init); module_exit(br_netfilter_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");
137 169 176 166 262 259 129 2 2 2 169 169 164 158 48 6 37 3 11 3 13 2 1 2 117 2 15 129 260 171 170 171 125 124 125 125 125 125 171 3 3 171 171 169 11 170 170 171 172 170 170 169 168 171 174 174 174 174 169 115 171 170 171 169 171 169 260 244 258 7 125 7 125 129 129 4 130 130 3 118 15 15 2 262 14 4 250 255 42 246 260 4 1 262 260 261 259 4 1 262 130 257 4 4 262 177 146 42 245 246 246 124 42 10 34 261 262 259 4 262 262 42 246 259 128 261 4 245 246 42 42 4 4 4 169 182 143 143 143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/sort.h> #include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "extent-tree.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" #include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_ref_node_cachep; struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for * delayed processing. This avoids deep call chains where we * add extents in the middle of btrfs_search_slot, and it allows * us to buffer up frequently modified backrefs in an rb tree instead * of hammering updates on the extent allocation tree. */ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; bool ret = false; u64 reserved; spin_lock(&global_rsv->lock); reserved = global_rsv->reserved; spin_unlock(&global_rsv->lock); /* * Since the global reserve is just kind of magic we don't really want * to rely on it to save our bacon, so if our size is more than the * delayed_refs_rsv and the global rsv then it's time to think about * bailing. */ spin_lock(&delayed_refs_rsv->lock); reserved += delayed_refs_rsv->reserved; if (delayed_refs_rsv->size >= reserved) ret = true; spin_unlock(&delayed_refs_rsv->lock); return ret; } /* * Release a ref head's reservation. * * @fs_info: the filesystem * @nr_refs: number of delayed refs to drop * @nr_csums: number of csum items to drop * * Drops the delayed ref head's count from the delayed refs rsv and free any * excess reservation we had. */ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; u64 num_bytes; u64 released; num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and * add it to the delayed_refs_rsv. */ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; u64 num_bytes; u64 reserved_bytes; if (btrfs_is_testing(fs_info)) return; num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); if (num_bytes == 0) return; /* * Try to take num_bytes from the transaction's local delayed reserve. * If not possible, try to take as much as it's available. If the local * reserve doesn't have enough reserved space, the delayed refs reserve * will be refilled next time btrfs_delayed_refs_rsv_refill() is called * by someone or if a transaction commit is triggered before that, the * global block reserve will be used. We want to minimize using the * global block reserve for cases we can account for in advance, to * avoid exhausting it and reach -ENOSPC during a transaction commit. */ spin_lock(&local_rsv->lock); reserved_bytes = min(num_bytes, local_rsv->reserved); local_rsv->reserved -= reserved_bytes; local_rsv->full = (local_rsv->reserved >= local_rsv->size); spin_unlock(&local_rsv->lock); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->reserved += reserved_bytes; delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; trans->delayed_ref_csum_deletions = 0; } /* * Adjust the size of the delayed refs block reserve for 1 block group item * insertion, used after allocating a block group. */ void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; spin_lock(&delayed_rsv->lock); /* * Inserting a block group item does not require changing the free space * tree, only the extent tree or the block group tree, so this is all we * need. */ delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); } /* * Adjust the size of the delayed refs block reserve to release space for 1 * block group item insertion. */ void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); u64 released; released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); if (released > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Adjust the size of the delayed refs block reserve for 1 block group item * update. */ void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; spin_lock(&delayed_rsv->lock); /* * Updating a block group item does not result in new nodes/leaves and * does not require changing the free space tree, only the extent tree * or the block group tree, so this is all we need. */ delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); } /* * Adjust the size of the delayed refs block reserve to release space for 1 * block group item update. */ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); u64 released; released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); if (released > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. * * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. */ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *space_info = block_rsv->space_info; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; u64 refilled_bytes; u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; num_bytes = min(num_bytes, limit); } spin_unlock(&block_rsv->lock); if (!num_bytes) return 0; ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); if (ret) return ret; /* * We may have raced with someone else, so check again if we the block * reserve is still not full and release any excess space. */ spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) { u64 needed = block_rsv->size - block_rsv->reserved; if (num_bytes >= needed) { block_rsv->reserved += needed; block_rsv->full = true; to_free = num_bytes - needed; refilled_bytes = needed; } else { block_rsv->reserved += num_bytes; to_free = 0; refilled_bytes = num_bytes; } } else { to_free = num_bytes; refilled_bytes = 0; } spin_unlock(&block_rsv->lock); if (to_free > 0) btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, refilled_bytes, 1); return 0; } /* * compare two delayed data backrefs with same bytenr and type */ static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, const struct btrfs_delayed_ref_node *ref2) { if (ref1->data_ref.objectid < ref2->data_ref.objectid) return -1; if (ref1->data_ref.objectid > ref2->data_ref.objectid) return 1; if (ref1->data_ref.offset < ref2->data_ref.offset) return -1; if (ref1->data_ref.offset > ref2->data_ref.offset) return 1; return 0; } static int comp_refs(const struct btrfs_delayed_ref_node *ref1, const struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 0; if (ref1->type < ref2->type) return -1; if (ref1->type > ref2->type) return 1; if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_DATA_REF_KEY) { if (ref1->parent < ref2->parent) return -1; if (ref1->parent > ref2->parent) return 1; } else { if (ref1->ref_root < ref2->ref_root) return -1; if (ref1->ref_root > ref2->ref_root) return 1; if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) ret = comp_data_refs(ref1, ref2); } if (ret) return ret; if (check_seq) { if (ref1->seq < ref2->seq) return -1; if (ref1->seq > ref2->seq) return 1; } return 0; } static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) { const struct btrfs_delayed_ref_node *new_node = rb_entry(new, struct btrfs_delayed_ref_node, ref_node); const struct btrfs_delayed_ref_node *exist_node = rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); return comp_refs(new_node, exist_node, true); } static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { struct rb_node *node = &ins->ref_node; struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node); return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node); } static struct btrfs_delayed_ref_head *find_first_ref_head( struct btrfs_delayed_ref_root *dr) { unsigned long from = 0; lockdep_assert_held(&dr->lock); return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT); } static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return true; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); if (!head->tracked) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return false; } btrfs_put_delayed_ref_head(head); return true; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { lockdep_assert_held(&head->lock); rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } static bool merge_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) { struct btrfs_delayed_ref_node *next; struct rb_node *node = rb_next(&ref->ref_node); bool done = false; while (!done && node) { int mod; next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); node = rb_next(node); if (seq && next->seq >= seq) break; if (comp_refs(ref, next, false)) break; if (ref->action == next->action) { mod = next->ref_mod; } else { if (ref->ref_mod < next->ref_mod) { swap(ref, next); done = true; } mod = -next->ref_mod; } drop_delayed_ref(fs_info, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { drop_delayed_ref(fs_info, delayed_refs, head, ref); done = true; } else { /* * Can't have multiples of the same ref on a tree block. */ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } } return done; } void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; lockdep_assert_held(&head->lock); if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return; /* We don't have too many refs to merge for data. */ if (head->is_data) return; seq = btrfs_tree_mod_log_lowest_seq(fs_info); again: for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; if (merge_ref(fs_info, delayed_refs, head, ref, seq)) goto again; } } int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { int ret = 0; u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); if (min_seq != 0 && seq >= min_seq) { btrfs_debug(fs_info, "holding back delayed_ref %llu, lowest is %llu", seq, min_seq); ret = 1; } return ret; } struct btrfs_delayed_ref_head *btrfs_select_ref_head( const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; unsigned long start_index; unsigned long found_index; bool found_head = false; bool locked; spin_lock(&delayed_refs->lock); again: start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits); xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) { if (!head->processing) { found_head = true; break; } } if (!found_head) { if (delayed_refs->run_delayed_start == 0) { spin_unlock(&delayed_refs->lock); return NULL; } delayed_refs->run_delayed_start = 0; goto again; } head->processing = true; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; locked = btrfs_delayed_ref_lock(delayed_refs, head); spin_unlock(&delayed_refs->lock); /* * We may have dropped the spin lock to get the head mutex lock, and * that might have given someone else time to free the head. If that's * true, it has been removed from our list and we can move on. */ if (!locked) return ERR_PTR(-EAGAIN); return head; } void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { spin_lock(&delayed_refs->lock); head->processing = false; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); btrfs_delayed_ref_unlock(head); } void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits); lockdep_assert_held(&delayed_refs->lock); lockdep_assert_held(&head->lock); xa_erase(&delayed_refs->head_refs, index); head->tracked = false; delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; } struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; lockdep_assert_held(&head->mutex); lockdep_assert_held(&head->lock); if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return NULL; /* * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. * This is to prevent a ref count from going down to zero, which deletes * the extent item from the extent tree, when there still are references * to add, which would fail because they would not find the extent item. */ if (!list_empty(&head->ref_add_list)) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); ref = rb_entry(rb_first_cached(&head->ref_tree), struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; } /* * Helper to insert the ref_node to the tail or merge with tail. * * Return false if the ref was inserted. * Return true if the ref was merged into an existing one (and therefore can be * freed by the caller). */ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_node *exist; int mod; spin_lock(&href->lock); exist = tree_insert(&href->ref_tree, ref); if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; } /* Now we are sure we can merge */ if (exist->action == ref->action) { mod = ref->ref_mod; } else { /* Need to change action */ if (exist->ref_mod < ref->ref_mod) { exist->action = ref->action; mod = -exist->ref_mod; exist->ref_mod = ref->ref_mod; if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&exist->add_list, &href->ref_add_list); else if (ref->action == BTRFS_DROP_DELAYED_REF) { ASSERT(!list_empty(&exist->add_list)); list_del_init(&exist->add_list); } else { ASSERT(0); } } else mod = -ref->ref_mod; } exist->ref_mod += mod; /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) drop_delayed_ref(trans->fs_info, root, href, exist); spin_unlock(&href->lock); return true; } /* * helper function to update the accounting in the head ref * existing and update must have the same bytenr */ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update) { struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); spin_lock(&existing->lock); /* * When freeing an extent, we may not know the owning root when we * first create the head_ref. However, some deref before the last deref * will know it, so we just need to update the head_ref accordingly. */ if (!existing->owning_root) existing->owning_root = update->owning_root; if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up * with an existing head ref without * the must_insert_reserved flag set. * Set it again here */ existing->must_insert_reserved = update->must_insert_reserved; existing->owning_root = update->owning_root; /* * update the num_bytes so we make sure the accounting * is done correctly */ existing->num_bytes = update->num_bytes; } if (update->extent_op) { if (!existing->extent_op) { existing->extent_op = update->extent_op; } else { if (update->extent_op->update_key) { memcpy(&existing->extent_op->key, &update->extent_op->key, sizeof(update->extent_op->key)); existing->extent_op->update_key = true; } if (update->extent_op->update_flags) { existing->extent_op->flags_to_set |= update->extent_op->flags_to_set; existing->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(update->extent_op); } } /* * update the reference mod on the head to reflect this new operation, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing->total_ref_mod; existing->ref_mod += update->ref_mod; existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. * We reserve bytes for csum deletion when adding or updating a ref head * see add_delayed_ref_head() for more details. */ if (existing->is_data) { u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, existing->num_bytes); if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); } if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; trans->delayed_ref_csum_deletions += csum_leaves; } } spin_unlock(&existing->lock); } static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_ref *generic_ref, struct btrfs_qgroup_extent_record *qrecord, u64 reserved) { int count_mod = 1; bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved); switch (generic_ref->action) { case BTRFS_ADD_DELAYED_REF: /* count_mod is already set to 1. */ break; case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; break; case BTRFS_DROP_DELAYED_REF: /* * The head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ count_mod = -1; break; case BTRFS_ADD_DELAYED_EXTENT: /* * BTRFS_ADD_DELAYED_EXTENT means that we need to update the * reserved accounting when the extent is finally added, or if a * later modification deletes the delayed ref without ever * inserting the extent into the extent allocation tree. * ref->must_insert_reserved is the flag used to record that * accounting mods are required. * * Once we record must_insert_reserved, switch the action to * BTRFS_ADD_DELAYED_REF because other special casing is not * required. */ must_insert_reserved = true; break; } refcount_set(&head_ref->refs, 1); head_ref->bytenr = generic_ref->bytenr; head_ref->num_bytes = generic_ref->num_bytes; head_ref->ref_mod = count_mod; head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; head_ref->owning_root = generic_ref->owning_root; head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA); head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); head_ref->tracked = false; head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); /* If not metadata set an impossible level to help debugging. */ if (generic_ref->type == BTRFS_REF_METADATA) head_ref->level = generic_ref->tree_ref.level; else head_ref->level = U8_MAX; if (qrecord) { if (generic_ref->ref_root && reserved) { qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } } /* * helper function to actually insert a head node into the rbtree. * this does all the dirty work in terms of maintaining the correct * overall modification count. * * Returns an error pointer in case of an error. */ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); #if BITS_PER_LONG == 32 if (head_ref->bytenr >= MAX_LFS_FILESIZE) { if (qrecord) xa_release(&delayed_refs->dirty_extents, index); btrfs_err_rl(fs_info, "delayed ref head %llu is beyond 32bit page cache and xarray index limit", head_ref->bytenr); btrfs_err_32bit_limit(fs_info); return ERR_PTR(-EOVERFLOW); } #endif /* Record qgroup extent info if provided */ if (qrecord) { int ret; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); kfree(qrecord); } else { qrecord_inserted = true; } } trace_add_delayed_ref_head(fs_info, head_ref, action); existing = xa_load(&delayed_refs->head_refs, index); if (existing) { update_existing_head_ref(trans, existing, head_ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC); if (xa_is_err(existing)) { /* Memory was preallocated by the caller. */ ASSERT(xa_err(existing) != -ENOMEM); return ERR_PTR(xa_err(existing)); } else if (WARN_ON(existing)) { /* * Shouldn't happen we just did a lookup before under * delayed_refs->lock. */ return ERR_PTR(-EEXIST); } head_ref->tracked = true; /* * We reserve the amount of bytes needed to delete csums when * adding the ref head and not when adding individual drop refs * since the csum items are deleted only after running the last * delayed drop ref (the data extent's ref count drops to 0). */ if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; return head_ref; } /* * Initialize the structure which represents a modification to an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * * @ref: The structure which is going to be initialized. * * @bytenr: The logical address of the extent for which a modification is * going to be recorded. * * @num_bytes: Size of the extent whose modification is being recorded. * * @ref_root: The id of the root where this modification has originated, this * can be either one of the well-known metadata trees or the * subvolume id which references this extent. * * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or * BTRFS_ADD_DELAYED_EXTENT * * @ref_type: Holds the type of the extent which is being recorded, can be * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ * BTRFS_EXTENT_DATA_REF_KEY when recording data extent */ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, struct btrfs_ref *generic_ref) { int action = generic_ref->action; u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; if (btrfs_is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); ref->bytenr = generic_ref->bytenr; ref->num_bytes = generic_ref->num_bytes; ref->ref_mod = 1; ref->action = action; ref->seq = seq; ref->type = btrfs_ref_type(generic_ref); ref->ref_root = generic_ref->ref_root; ref->parent = generic_ref->parent; RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); if (generic_ref->type == BTRFS_REF_DATA) ref->data_ref = generic_ref->data_ref; else ref->tree_ref = generic_ref->tree_ref; } void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif generic_ref->tree_ref.level = level; generic_ref->type = BTRFS_REF_METADATA; if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; } void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif generic_ref->data_ref.objectid = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; } static int add_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op, u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *node; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_head *new_head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_reserved = false; bool qrecord_inserted; int action = generic_ref->action; bool merged; int ret; node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS); if (!node) return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { ret = -ENOMEM; goto free_node; } delayed_refs = &trans->transaction->delayed_refs; if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { ret = -ENOMEM; goto free_head_ref; } if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { ret = -ENOMEM; goto free_record; } qrecord_reserved = true; } ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); if (ret) { if (qrecord_reserved) xa_release(&delayed_refs->dirty_extents, index); goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; spin_lock(&delayed_refs->lock); /* * insert both the head node and the new ref without dropping * the spin lock */ new_head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); if (IS_ERR(new_head_ref)) { xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); ret = PTR_ERR(new_head_ref); goto free_record; } head_ref = new_head_ref; merged = insert_delayed_ref(trans, head_ref, node); spin_unlock(&delayed_refs->lock); /* * Need to update the delayed_refs_rsv with any changes we may have * made. */ btrfs_update_delayed_refs_rsv(trans); if (generic_ref->type == BTRFS_REF_DATA) trace_add_delayed_data_ref(trans->fs_info, node); else trace_add_delayed_tree_ref(trans->fs_info, node); if (merged) kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: kfree(record); free_head_ref: kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); free_node: kmem_cache_free(btrfs_delayed_ref_node_cachep, node); return ret; } /* * Add a delayed tree ref. This does all of the accounting required to make sure * the delayed ref is eventually processed before this transaction commits. */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op) { ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); return add_delayed_ref(trans, generic_ref, extent_op, 0); } /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved) { ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action); return add_delayed_ref(trans, generic_ref, NULL, reserved); } int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op) { const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits); struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_head *head_ref_ret; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_ref generic_ref = { .type = BTRFS_REF_METADATA, .action = BTRFS_UPDATE_DELAYED_HEAD, .bytenr = bytenr, .num_bytes = num_bytes, .tree_ref.level = level, }; int ret; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; init_delayed_ref_head(head_ref, &generic_ref, NULL, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); if (ret) { kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); return ret; } spin_lock(&delayed_refs->lock); head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, NULL); if (IS_ERR(head_ref_ret)) { xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); return PTR_ERR(head_ref_ret); } spin_unlock(&delayed_refs->lock); /* * Need to update the delayed_refs_rsv with any changes we may have * made. */ btrfs_update_delayed_refs_rsv(trans); return 0; } void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { if (refcount_dec_and_test(&ref->refs)) { WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); kmem_cache_free(btrfs_delayed_ref_node_cachep, ref); } } /* * This does a simple search for the head node for a given extent. Returns the * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { const unsigned long index = (bytenr >> fs_info->sectorsize_bits); lockdep_assert_held(&delayed_refs->lock); return xa_load(&delayed_refs->head_refs, index); } static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) { int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; if (type < entry->type) return -1; if (type > entry->type) return 1; if (type == BTRFS_TREE_BLOCK_REF_KEY) { if (root < entry->ref_root) return -1; if (root > entry->ref_root) return 1; } else { if (parent < entry->parent) return -1; if (parent > entry->parent) return 1; } return 0; } /* * Check to see if a given root/parent reference is attached to the head. This * only checks for BTRFS_ADD_DELAYED_REF references that match, as that * indicates the reference exists for the given root or parent. This is for * tree blocks only. * * @head: the head of the bytenr we're searching. * @root: the root objectid of the reference if it is a normal reference. * @parent: the parent if this is a shared backref. */ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent) { struct rb_node *node; bool found = false; lockdep_assert_held(&head->mutex); spin_lock(&head->lock); node = head->ref_tree.rb_root.rb_node; while (node) { struct btrfs_delayed_ref_node *entry; int ret; entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); ret = find_comp(entry, root, parent); if (ret < 0) { node = node->rb_left; } else if (ret > 0) { node = node->rb_right; } else { /* * We only want to count ADD actions, as drops mean the * ref doesn't exist. */ if (entry->action == BTRFS_ADD_DELAYED_REF) found = true; break; } } spin_unlock(&head->lock); return found; } void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; spin_lock(&delayed_refs->lock); while (true) { struct btrfs_delayed_ref_head *head; struct rb_node *n; bool pin_bytes = false; head = find_first_ref_head(delayed_refs); if (!head) break; if (!btrfs_delayed_ref_lock(delayed_refs, head)) continue; spin_lock(&head->lock); while ((n = rb_first_cached(&head->ref_tree)) != NULL) { struct btrfs_delayed_ref_node *ref; ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); drop_delayed_ref(fs_info, delayed_refs, head, ref); } if (head->must_insert_reserved) pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (!btrfs_is_testing(fs_info) && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); if (WARN_ON_ONCE(bg == NULL)) { /* * Unexpected and there's nothing we can do here * because we are in a transaction abort path, * so any errors can only be ignored or reported * while attempting to cleanup all resources. */ btrfs_err(fs_info, "block group for delayed ref at %llu was not found while destroying ref head", head->bytenr); } else { spin_lock(&bg->space_info->lock); spin_lock(&bg->lock); bg->pinned += head->num_bytes; btrfs_space_info_update_bytes_pinned(bg->space_info, head->num_bytes); bg->reserved -= head->num_bytes; bg->space_info->bytes_reserved -= head->num_bytes; spin_unlock(&bg->lock); spin_unlock(&bg->space_info->lock); btrfs_put_block_group(bg); } btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } if (!btrfs_is_testing(fs_info)) btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } if (!btrfs_is_testing(fs_info)) btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); } void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); kmem_cache_destroy(btrfs_delayed_ref_node_cachep); kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) return -ENOMEM; btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); if (!btrfs_delayed_ref_node_cachep) goto fail; btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); if (!btrfs_delayed_extent_op_cachep) goto fail; return 0; fail: btrfs_delayed_ref_exit(); return -ENOMEM; }
1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 3 3 1 3 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/crc32c.h> #include <linux/ctype.h> #include <linux/highmem.h> #include <linux/inet.h> #include <linux/kthread.h> #include <linux/net.h> #include <linux/nsproxy.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> #ifdef CONFIG_BLOCK #include <linux/bio.h> #endif /* CONFIG_BLOCK */ #include <linux/dns_resolver.h> #include <net/tcp.h> #include <trace/events/sock.h> #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> #include <linux/export.h> /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable * delivery. We tolerate TCP disconnects by reconnecting (with * exponential backoff) in the case of a fault (disconnection, bad * crc, protocol error). Acks allow sent messages to be discarded by * the sender. */ /* * We track the state of the socket on a given connection using * values defined below. The transition to a new socket state is * handled by a function which verifies we aren't coming from an * unexpected state. * * -------- * | NEW* | transient initial state * -------- * | con_sock_state_init() * v * ---------- * | CLOSED | initialized, but no socket (and no * ---------- TCP connection) * ^ \ * | \ con_sock_state_connecting() * | ---------------------- * | \ * + con_sock_state_closed() \ * |+--------------------------- \ * | \ \ \ * | ----------- \ \ * | | CLOSING | socket event; \ \ * | ----------- await close \ \ * | ^ \ | * | | \ | * | + con_sock_state_closing() \ | * | / \ | | * | / --------------- | | * | / \ v v * | / -------------- * | / -----------------| CONNECTING | socket created, TCP * | | / -------------- connect initiated * | | | con_sock_state_connected() * | | v * ------------- * | CONNECTED | TCP connection established * ------------- * * State values for ceph_connection->sock_state; NEW is assumed to be 0. */ #define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ #define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ #define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { case CEPH_CON_F_LOSSYTX: case CEPH_CON_F_KEEPALIVE_PENDING: case CEPH_CON_F_WRITE_PENDING: case CEPH_CON_F_SOCK_CLOSED: case CEPH_CON_F_BACKOFF: return true; default: return false; } } void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } bool ceph_con_flag_test_and_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } bool ceph_con_flag_test_and_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_set_bit(con_flag, &con->flags); } /* Slab caches for frequently-allocated structures */ static struct kmem_cache *ceph_msg_cache; #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); static void ceph_con_workfn(struct work_struct *); static void con_fault(struct ceph_connection *con); /* * Nicely render a sockaddr as a string. An array of formatted * strings is used, to approximate reentrancy. */ #define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ #define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) #define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) #define MAX_ADDR_STR_LEN 64 /* 54 is enough */ static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { int i; char *s; struct sockaddr_storage ss = addr->in_addr; /* align */ struct sockaddr_in *in4 = (struct sockaddr_in *)&ss; struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; switch (ss.ss_family) { case AF_INET: snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu", le32_to_cpu(addr->type), &in4->sin_addr, ntohs(in4->sin_port)); break; case AF_INET6: snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu", le32_to_cpu(addr->type), &in6->sin6_addr, ntohs(in6->sin6_port)); break; default: snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", ss.ss_family); } return s; } EXPORT_SYMBOL(ceph_pr_addr); void ceph_encode_my_addr(struct ceph_messenger *msgr) { if (!ceph_msgr2(from_msgr(msgr))) { memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); ceph_encode_banner_addr(&msgr->my_enc_addr); } } /* * work queue for all reading and writing to/from the socket. */ static struct workqueue_struct *ceph_msgr_wq; static int ceph_msgr_slab_init(void) { BUG_ON(ceph_msg_cache); ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); if (!ceph_msg_cache) return -ENOMEM; return 0; } static void ceph_msgr_slab_exit(void) { BUG_ON(!ceph_msg_cache); kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; } static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { destroy_workqueue(ceph_msgr_wq); ceph_msgr_wq = NULL; } BUG_ON(!ceph_zero_page); put_page(ceph_zero_page); ceph_zero_page = NULL; ceph_msgr_slab_exit(); } int __init ceph_msgr_init(void) { if (ceph_msgr_slab_init()) return -ENOMEM; BUG_ON(ceph_zero_page); ceph_zero_page = ZERO_PAGE(0); get_page(ceph_zero_page); /* * The number of active work items is limited by the number of * connections, so leave @max_active at default. */ ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (ceph_msgr_wq) return 0; pr_err("msgr_init failed to create workqueue\n"); _ceph_msgr_exit(); return -ENOMEM; } void ceph_msgr_exit(void) { BUG_ON(ceph_msgr_wq == NULL); _ceph_msgr_exit(); } void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } EXPORT_SYMBOL(ceph_msgr_flush); /* Connection socket state transition functions */ static void con_sock_state_init(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } static void con_sock_state_connecting(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTING); } static void con_sock_state_connected(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTED); } static void con_sock_state_closing(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSING); } static void con_sock_state_closed(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING && old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } /* * socket callback functions */ /* data available on socket, or listen socket received a connect */ static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; trace_sk_data_ready(sk); if (atomic_read(&con->msgr->stopping)) { return; } if (sk->sk_state != TCP_CLOSE_WAIT) { dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } } /* socket has buffer space for writing */ static void ceph_sock_write_space(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * doesn't get called again until try_write() fills the socket * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); } } else { dout("%s %p nothing to write\n", __func__, con); } } /* socket's state has changed */ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); fallthrough; case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: dout("%s TCP_ESTABLISHED\n", __func__); con_sock_state_connected(con); queue_con(con); break; default: /* Everything else is uninteresting */ break; } } /* * set up socket callbacks */ static void set_sock_callbacks(struct socket *sock, struct ceph_connection *con) { struct sock *sk = sock->sk; sk->sk_user_data = con; sk->sk_data_ready = ceph_sock_data_ready; sk->sk_write_space = ceph_sock_write_space; sk->sk_state_change = ceph_sock_state_change; } /* * socket helpers */ /* * initiate connection to a remote socket. */ int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; dout("%s con %p peer_addr %s\n", __func__, con, ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); memalloc_noio_restore(noio_flag); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; sock->sk->sk_use_task_frag = false; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); #endif set_sock_callbacks(sock, con); con_sock_state_connecting(con); ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", ceph_pr_addr(&con->peer_addr), sock->sk->sk_state); } else if (ret < 0) { pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr), ret); sock_release(sock); return ret; } if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; } /* * Shutdown/close the socket for the given connection. */ int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); con->sock = NULL; } /* * Forcibly clear the SOCK_CLOSED flag. It gets set * independent of the connection mutex, and we could have * received a socket close event before we had the chance to * shut the socket down. */ ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; } static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); ceph_con_close_socket(con); if (con->in_msg) { WARN_ON(con->in_msg->con != con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } if (con->out_msg) { WARN_ON(con->out_msg->con != con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } if (con->bounce_page) { __free_page(con->bounce_page); con->bounce_page = NULL; } if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_protocol(con); else ceph_con_v1_reset_protocol(con); } /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. */ static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); ceph_msg_put(msg); } static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, list_head); ceph_msg_remove(msg); } } void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); WARN_ON(con->in_msg); WARN_ON(con->out_msg); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); con->out_seq = 0; con->in_seq = 0; con->in_seq_acked = 0; if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_session(con); else ceph_con_v1_reset_session(con); } /* * mark a peer down. drop any open connections. */ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CEPH_CON_S_CLOSED; ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next connect */ ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); ceph_con_reset_protocol(con); ceph_con_reset_session(con); cancel_con(con); mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ void ceph_con_open(struct ceph_connection *con, __u8 entity_type, __u64 entity_num, struct ceph_entity_addr *addr) { mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); WARN_ON(con->state != CEPH_CON_S_CLOSED); con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); /* * return true if this connection ever successfully opened */ bool ceph_con_opened(struct ceph_connection *con) { if (ceph_msgr2(from_msgr(con->msgr))) return ceph_con_v2_opened(con); return ceph_con_v1_opened(con); } /* * initialize a new connection. */ void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); con->private = private; con->ops = ops; con->msgr = msgr; con_sock_state_init(con); mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; spin_lock(&msgr->global_seq_lock); if (msgr->global_seq < gt) msgr->global_seq = gt; ret = ++msgr->global_seq; spin_unlock(&msgr->global_seq_lock); return ret; } /* * Discard messages that have been acked by the server. */ void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { struct ceph_msg *msg; u64 seq; dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); while (!list_empty(&con->out_sent)) { msg = list_first_entry(&con->out_sent, struct ceph_msg, list_head); WARN_ON(msg->needs_out_seq); seq = le64_to_cpu(msg->hdr.seq); if (seq > ack_seq) break; dout("%s con %p discarding msg %p seq %llu\n", __func__, con, msg, seq); ceph_msg_remove(msg); } } /* * Discard messages that have been requeued in con_fault(), up to * reconnect_seq. This avoids gratuitously resending messages that * the server had received and handled prior to reconnect. */ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { struct ceph_msg *msg; u64 seq; dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); while (!list_empty(&con->out_queue)) { msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); if (msg->needs_out_seq) break; seq = le64_to_cpu(msg->hdr.seq); if (seq > reconnect_seq) break; dout("%s con %p discarding msg %p seq %llu\n", __func__, con, msg, seq); ceph_msg_remove(msg); } } #ifdef CONFIG_BLOCK /* * For a bio data item, a piece is whatever remains of the next * entry in the current bio iovec, or the first entry in the next * bio in the list. */ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_bio_iter *it = &cursor->bio_iter; cursor->resid = min_t(size_t, length, data->bio_length); *it = data->bio_pos; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, cursor->bio_iter.iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_bio_iter *it = &cursor->bio_iter; struct page *page = bio_iter_page(it->bio, it->iter); BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); cursor->resid -= bytes; bio_advance_iter(it->bio, &it->iter, bytes); if (!cursor->resid) return false; /* no more data */ if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done && page == bio_iter_page(it->bio, it->iter))) return false; /* more bytes to process in this segment */ if (!it->iter.bi_size) { it->bio = it->bio->bi_next; it->iter = it->bio->bi_iter; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; } BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); return true; } #endif /* CONFIG_BLOCK */ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct bio_vec *bvecs = data->bvec_pos.bvecs; cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); cursor->bvec_iter = data->bvec_pos.iter; cursor->bvec_iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); } static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, cursor->bvec_iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter); BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); cursor->resid -= bytes; bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); if (!cursor->resid) return false; /* no more data */ if (!bytes || (cursor->bvec_iter.bi_bvec_done && page == bvec_iter_page(bvecs, cursor->bvec_iter))) return false; /* more bytes to process in this segment */ BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); return true; } /* * For a page array, a piece comes from the first page in the array * that has not already been fully consumed. */ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; int page_count; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(!data->pages); BUG_ON(!data->length); cursor->resid = min(length, data->length); page_count = calc_pages_for(data->alignment, (u64)data->length); cursor->page_offset = data->alignment & ~PAGE_MASK; cursor->page_index = 0; BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); } static struct page * ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_index >= cursor->page_count); BUG_ON(cursor->page_offset >= PAGE_SIZE); *page_offset = cursor->page_offset; *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return data->pages[cursor->page_index]; } static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); /* Advance the cursor page offset */ cursor->resid -= bytes; cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); cursor->page_index++; return true; } /* * For a pagelist, a piece is whatever remains to be consumed in the * first page in the list, or the front of the next page. */ static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; struct page *page; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); if (!length) return; /* pagelist can be assigned but empty */ BUG_ON(list_empty(&pagelist->head)); page = list_first_entry(&pagelist->head, struct page, lru); cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; } static struct page * ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(!cursor->page); BUG_ON(cursor->offset + cursor->resid != pagelist->length); /* offset of first page in pagelist is always 0 */ *page_offset = cursor->offset & ~PAGE_MASK; *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return cursor->page; } static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(cursor->offset + cursor->resid != pagelist->length); BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); /* Advance the cursor offset */ cursor->resid -= bytes; cursor->offset += bytes; /* offset of first page in pagelist is always 0 */ if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); cursor->page = list_next_entry(cursor->page, lru); return true; } static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; cursor->iov_iter = data->iter; cursor->lastlen = 0; iov_iter_truncate(&cursor->iov_iter, length); cursor->resid = iov_iter_count(&cursor->iov_iter); } static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct page *page; ssize_t len; if (cursor->lastlen) iov_iter_revert(&cursor->iov_iter, cursor->lastlen); len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE, 1, page_offset); BUG_ON(len < 0); cursor->lastlen = len; /* * FIXME: The assumption is that the pages represented by the iov_iter * are pinned, with the references held by the upper-level * callers, or by virtue of being under writeback. Eventually, * we'll get an iov_iter_get_pages2 variant that doesn't take * page refs. Until then, just put the page ref. */ VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page); put_page(page); *length = min_t(size_t, len, cursor->resid); return page; } static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { BUG_ON(bytes > cursor->resid); cursor->resid -= bytes; if (bytes < cursor->lastlen) { cursor->lastlen -= bytes; } else { iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen); cursor->lastlen = 0; } return cursor->resid; } /* * Message data is handled (sent or received) in pieces, where each * piece resides on a single page. The network layer might not * consume an entire piece at once. A data item's cursor keeps * track of which piece is next to process and how much remains to * be processed in that piece. It also tracks whether the current * piece is the last one in the data item. */ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) { size_t length = cursor->total_resid; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(cursor, length); break; case CEPH_MSG_DATA_PAGES: ceph_msg_data_pages_cursor_init(cursor, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: ceph_msg_data_bio_cursor_init(cursor, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: ceph_msg_data_bvecs_cursor_init(cursor, length); break; case CEPH_MSG_DATA_ITER: ceph_msg_data_iter_cursor_init(cursor, length); break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ break; } cursor->need_crc = true; } void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, struct ceph_msg *msg, size_t length) { BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(!msg->num_data_items); cursor->total_resid = length; cursor->data = msg->data; cursor->sr_resid = 0; __ceph_msg_data_cursor_init(cursor); } /* * Return the page containing the next piece to process for a given * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. */ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct page *page; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: page = ceph_msg_data_pagelist_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_PAGES: page = ceph_msg_data_pages_next(cursor, page_offset, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: page = ceph_msg_data_bio_next(cursor, page_offset, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: page = ceph_msg_data_bvecs_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_ITER: page = ceph_msg_data_iter_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_NONE: default: page = NULL; break; } BUG_ON(!page); BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); BUG_ON(*length > cursor->resid); return page; } /* * Returns true if the result moves the cursor on to the next piece * of the data item. */ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; BUG_ON(bytes > cursor->resid); switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); break; case CEPH_MSG_DATA_PAGES: new_piece = ceph_msg_data_pages_advance(cursor, bytes); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: new_piece = ceph_msg_data_bio_advance(cursor, bytes); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); break; case CEPH_MSG_DATA_ITER: new_piece = ceph_msg_data_iter_advance(cursor, bytes); break; case CEPH_MSG_DATA_NONE: default: BUG(); break; } cursor->total_resid -= bytes; if (!cursor->resid && cursor->total_resid) { cursor->data++; __ceph_msg_data_cursor_init(cursor); new_piece = true; } cursor->need_crc = new_piece; } u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, unsigned int length) { char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); crc = crc32c(crc, kaddr + page_offset, length); kunmap(page); return crc; } bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr; switch (ss.ss_family) { case AF_INET: return addr4->s_addr == htonl(INADDR_ANY); case AF_INET6: return ipv6_addr_any(addr6); default: return true; } } EXPORT_SYMBOL(ceph_addr_is_blank); int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port)); case AF_INET6: return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port)); } return 0; } void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); break; case AF_INET6: put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port); break; } } /* * Unlike other *_pton function semantics, zero indicates success. */ static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr, char delim, const char **ipend) { memset(&addr->in_addr, 0, sizeof(addr->in_addr)); if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) { put_unaligned(AF_INET, &addr->in_addr.ss_family); return 0; } if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) { put_unaligned(AF_INET6, &addr->in_addr.ss_family); return 0; } return -EINVAL; } /* * Extract hostname string and resolve using kernel DNS facility. */ #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER static int ceph_dns_resolve_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { const char *end, *delim_p; char *colon_p, *ip_addr = NULL; int ip_len, ret; /* * The end of the hostname occurs immediately preceding the delimiter or * the port marker (':') where the delimiter takes precedence. */ delim_p = memchr(name, delim, namelen); colon_p = memchr(name, ':', namelen); if (delim_p && colon_p) end = min(delim_p, colon_p); else if (!delim_p && colon_p) end = colon_p; else { end = delim_p; if (!end) /* case: hostname:/ */ end = name + namelen; } if (end <= name) return -EINVAL; /* do dns_resolve upcall */ ip_len = dns_query(current->nsproxy->net_ns, NULL, name, end - name, NULL, &ip_addr, NULL, false); if (ip_len > 0) ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL); else ret = -ESRCH; kfree(ip_addr); *ipend = end; pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, ret, ret ? "failed" : ceph_pr_addr(addr)); return ret; } #else static inline int ceph_dns_resolve_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { return -EINVAL; } #endif /* * Parse a server name (IP or hostname). If a valid IP address is not found * then try to extract a hostname to resolve using userspace DNS upcall. */ static int ceph_parse_server_name(const char *name, size_t namelen, struct ceph_entity_addr *addr, char delim, const char **ipend) { int ret; ret = ceph_pton(name, namelen, addr, delim, ipend); if (ret) ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); return ret; } /* * Parse an ip[:port] list into an addr array. Use the default * monitor port if a port isn't specified. */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count, char delim) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { char cur_delim = delim; const char *ipend; int port; if (*p == '[') { cur_delim = ']'; p++; } ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; if (cur_delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; } p++; } /* port? */ if (p < end && *p == ':') { port = 0; p++; while (p < end && *p >= '0' && *p <= '9') { port = (port * 10) + (*p - '0'); p++; } if (port == 0) port = CEPH_MON_PORT; else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; } ceph_addr_set_port(&addr[i], port); /* * We want the type to be set according to ms_mode * option, but options are normally parsed after mon * addresses. Rather than complicating parsing, set * to LEGACY and override in build_initial_monmap() * for mon addresses and ceph_messenger_init() for * ip option. */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; addr[i].nonce = 0; dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); if (p == end) break; if (*p != delim) goto bad; p++; } if (p != end) goto bad; if (count) *count = i + 1; return 0; bad: return ret; } /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. */ void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; BUG_ON(con->in_msg->con != con); con->in_msg = NULL; /* if first message, set peer_name */ if (con->peer_name.type == 0) con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); mutex_lock(&con->mutex); } /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. * Returns 0 if work was queued, or an error code otherwise. */ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) { if (!con->ops->get(con)) { dout("%s %p ref count 0\n", __func__, con); return -ENOENT; } if (delay >= HZ) delay = round_jiffies_relative(delay); dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); con->ops->put(con); return -EBUSY; } return 0; } static void queue_con(struct ceph_connection *con) { (void) queue_con_delay(con, 0); } static void cancel_con(struct ceph_connection *con) { if (cancel_delayed_work(&con->work)) { dout("%s %p\n", __func__, con); con->ops->put(con); } } static bool con_sock_closed(struct ceph_connection *con) { if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); CASE(V1_BANNER); CASE(V1_CONNECT_MSG); CASE(V2_BANNER_PREFIX); CASE(V2_BANNER_PAYLOAD); CASE(V2_HELLO); CASE(V2_AUTH); CASE(V2_AUTH_SIGNATURE); CASE(V2_SESSION_CONNECT); CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: BUG(); } #undef CASE return true; } static bool con_backoff(struct ceph_connection *con) { int ret; if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; } /* Finish fault handling; con->mutex must *not* be held here */ static void con_fault_finish(struct ceph_connection *con) { dout("%s %p\n", __func__, con); /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) { dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); con->v1.auth_retry = 0; } if (con->ops->fault) con->ops->fault(con); } /* * Do some work on a connection. Drop a connection ref when we're done. */ static void ceph_con_workfn(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); bool fault; mutex_lock(&con->mutex); while (true) { int ret; if ((fault = con_sock_closed(con))) { dout("%s: con %p SOCK_CLOSED\n", __func__, con); break; } if (con_backoff(con)) { dout("%s: con %p BACKOFF\n", __func__, con); break; } if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } if (ceph_msgr2(from_msgr(con->msgr))) ret = ceph_con_v2_try_read(con); else ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on read"; fault = true; break; } if (ceph_msgr2(from_msgr(con->msgr))) ret = ceph_con_v2_try_write(con); else ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on write"; fault = true; } break; /* If we make it to here, we're done */ } if (fault) con_fault(con); mutex_unlock(&con->mutex); if (fault) con_fault_finish(con); con->ops->put(con); } /* * Generic error/fault handler. A retry mechanism is used with * exponential backoff */ static void con_fault(struct ceph_connection *con) { dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; WARN_ON(con->state == CEPH_CON_S_STANDBY || con->state == CEPH_CON_S_CLOSED); ceph_con_reset_protocol(con); if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CEPH_CON_S_CLOSED; return; } /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. */ con->state = CEPH_CON_S_PREOPEN; if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; if (con->delay > MAX_DELAY_INTERVAL) con->delay = MAX_DELAY_INTERVAL; } ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); ceph_encode_my_addr(msgr); } /* * initialize a new messenger instance */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr) { spin_lock_init(&msgr->global_seq_lock); if (myaddr) { memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, sizeof(msgr->inst.addr.in_addr)); ceph_addr_set_port(&msgr->inst.addr, 0); } /* * Since nautilus, clients are identified using type ANY. * For msgr1, ceph_encode_banner_addr() munges it to NONE. */ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; /* generate a random non-zero nonce */ do { get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); } while (!msgr->inst.addr.nonce); ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { if (msg->con) msg->con->ops->put(msg->con); msg->con = con ? con->ops->get(con) : NULL; BUG_ON(msg->con != con); } static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p\n", con); con->state = CEPH_CON_S_PREOPEN; if (!ceph_msgr2(from_msgr(con->msgr))) con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. * * Consumes a ref on @msg. */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { /* set src+dst */ msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); msg->needs_out_seq = true; mutex_lock(&con->mutex); if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); return; } msg_con_set(msg, con); BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send */ void ceph_msg_revoke(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (list_empty(&msg->list_head)) { WARN_ON(con->out_msg == msg); dout("%s con %p msg %p not linked\n", __func__, con, msg); mutex_unlock(&con->mutex); return; } dout("%s con %p msg %p was linked\n", __func__, con, msg); msg->hdr.seq = 0; ceph_msg_remove(msg); if (con->out_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_revoke(con, msg); else ceph_con_v1_revoke(con, msg); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { dout("%s con %p msg %p not current, out_msg %p\n", __func__, con, msg, con->out_msg); } mutex_unlock(&con->mutex); } /* * Revoke a message that we may be reading data into */ void ceph_msg_revoke_incoming(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (con->in_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was recving\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_revoke_incoming(con); else ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } else { dout("%s con %p msg %p not current, in_msg %p\n", __func__, con, msg, con->in_msg); } mutex_unlock(&con->mutex); } /* * Queue a keepalive byte to ensure the tcp connection is alive. */ void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); bool ceph_con_keepalive_expired(struct ceph_connection *con, unsigned long interval) { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { struct timespec64 now; struct timespec64 ts; ktime_get_real_ts64(&now); jiffies_to_timespec64(interval, &ts); ts = timespec64_add(con->last_keepalive_ack, ts); return timespec64_compare(&now, &ts) >= 0; } return false; } static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) { BUG_ON(msg->num_data_items >= msg->max_data_items); return &msg->data[msg->num_data_items++]; } static void ceph_msg_data_destroy(struct ceph_msg_data *data) { if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) { int num_pages = calc_pages_for(data->alignment, data->length); ceph_release_page_vector(data->pages, num_pages); } else if (data->type == CEPH_MSG_DATA_PAGELIST) { ceph_pagelist_release(data->pagelist); } } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment, bool own_pages) { struct ceph_msg_data *data; BUG_ON(!pages); BUG_ON(!length); data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_PAGES; data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; data->own_pages = own_pages; msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_pages); void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { struct ceph_msg_data *data; BUG_ON(!pagelist); BUG_ON(!pagelist->length); data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_PAGELIST; refcount_inc(&pagelist->refcnt); data->pagelist = pagelist; msg->data_length += pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_add_pagelist); #ifdef CONFIG_BLOCK void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, u32 length) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_BIO; data->bio_pos = *bio_pos; data->bio_length = length; msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_bio); #endif /* CONFIG_BLOCK */ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, struct ceph_bvec_iter *bvec_pos) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_BVECS; data->bvec_pos = *bvec_pos; msg->data_length += bvec_pos->iter.bi_size; } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); void ceph_msg_data_add_iter(struct ceph_msg *msg, struct iov_iter *iter) { struct ceph_msg_data *data; data = ceph_msg_data_add(msg); data->type = CEPH_MSG_DATA_ITER; data->iter = *iter; msg->data_length += iov_iter_count(&data->iter); } /* * construct a new message with given type, size * the new msg has a ref count of 1. */ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, gfp_t flags, bool can_fail) { struct ceph_msg *m; m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); m->hdr.front_len = cpu_to_le32(front_len); INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); /* front */ if (front_len) { m->front.iov_base = kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); goto out2; } } else { m->front.iov_base = NULL; } m->front_alloc_len = m->front.iov_len = front_len; if (max_data_items) { m->data = kmalloc_array(max_data_items, sizeof(*m->data), flags); if (!m->data) goto out2; m->max_data_items = max_data_items; } dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: if (!can_fail) { pr_err("msg_new can't create type %d front %d\n", type, front_len); WARN_ON(1); } else { dout("msg_new can't create type %d front %d\n", type, front_len); } return NULL; } EXPORT_SYMBOL(ceph_msg_new2); struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail) { return ceph_msg_new2(type, front_len, 0, flags, can_fail); } EXPORT_SYMBOL(ceph_msg_new); /* * Allocate "middle" portion of a message, if it is needed and wasn't * allocated by alloc_msg. This allows us to read a small fixed-size * per-type header in the front and then gracefully fail (i.e., * propagate the error to the caller based on info in the front) when * the middle is too large. */ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); int middle_len = le32_to_cpu(msg->hdr.middle_len); dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, ceph_msg_type_name(type), middle_len); BUG_ON(!middle_len); BUG_ON(msg->middle); msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); if (!msg->middle) return -ENOMEM; return 0; } /* * Allocate a message for receiving an incoming message on a * connection, and save the result in con->in_msg. Uses the * connection's private alloc_msg op if available. * * Returns 0 on success, or a negative error code. * * On success, if we set *skip = 1: * - the next message should be skipped and ignored. * - con->in_msg == NULL * or if we set *skip = 0: * - con->in_msg is non-null. * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); BUG_ON(!con->ops->alloc_msg); mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; } if (msg) { BUG_ON(*skip); msg_con_set(msg, con); con->in_msg = msg; } else { /* * Null message pointer means either we should skip * this message or we couldn't allocate memory. The * former is not an error. */ if (*skip) return 0; con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); if (ret < 0) { ceph_msg_put(con->in_msg); con->in_msg = NULL; } } return ret; } struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; if (list_empty(&con->out_queue)) return NULL; msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); WARN_ON(msg->con != con); /* * Put the message on "sent" list using a ref from ceph_con_send(). * It is put when the message is acked or revoked. */ list_move_tail(&msg->list_head, &con->out_sent); /* * Only assign outgoing seq # if we haven't sent this message * yet. If it is requeued, resend with it's original seq. */ if (msg->needs_out_seq) { msg->hdr.seq = cpu_to_le64(++con->out_seq); msg->needs_out_seq = false; if (con->ops->reencode_message) con->ops->reencode_message(msg); } /* * Get a ref for out_msg. It is put when we are done sending the * message or in case of a fault. */ WARN_ON(con->out_msg); return con->out_msg = ceph_msg_get(msg); } /* * Free a generically kmalloc'd message. */ static void ceph_msg_free(struct ceph_msg *m) { dout("%s %p\n", __func__, m); kvfree(m->front.iov_base); kfree(m->data); kmem_cache_free(ceph_msg_cache, m); } static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); int i; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); msg_con_set(m, NULL); /* drop middle, data, if any */ if (m->middle) { ceph_buffer_put(m->middle); m->middle = NULL; } for (i = 0; i < m->num_data_items; i++) ceph_msg_data_destroy(&m->data[i]); if (m->pool) ceph_msgpool_put(m->pool, m); else ceph_msg_free(m); } struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_get(&msg->kref); return msg; } EXPORT_SYMBOL(ceph_msg_get); void ceph_msg_put(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_put(&msg->kref, ceph_msg_release); } EXPORT_SYMBOL(ceph_msg_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); print_hex_dump(KERN_DEBUG, " front: ", DUMP_PREFIX_OFFSET, 16, 1, msg->front.iov_base, msg->front.iov_len, true); if (msg->middle) print_hex_dump(KERN_DEBUG, "middle: ", DUMP_PREFIX_OFFSET, 16, 1, msg->middle->vec.iov_base, msg->middle->vec.iov_len, true); print_hex_dump(KERN_DEBUG, "footer: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->footer, sizeof(msg->footer), true); } EXPORT_SYMBOL(ceph_msg_dump);
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr * lists. * Cyrus Durgin: updated for kmod * Matthias Andree: in devinet_ioctl, compare label and * address (4.4BSD alias style support), * fall back to comparing just the label * if no match found. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/kmod.h> #include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/addrconf.h> #define IPV6ONLY_FLAGS \ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; #define IPV4_DEVCONF_DFLT(net, attr) \ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFA_PROTO] = { .type = NLA_U8 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); } /** * __ip_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address * @devref: if true, take a reference on the found device * * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); ifa = inet_lookup_ifaddr_rcu(net, addr); if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; /* Fallback to FIB local table so that communication * over loopback subnets work. */ local = fib_get_table(net, RT_TABLE_LOCAL); if (local && !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); } else { result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) if (ifa->ifa_local == addr) return ifa; return NULL; } static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else static int devinet_sysctl_register(struct in_device *idev) { return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { } #endif /* Locks all the inet devices. */ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; in_dev_hold(in_dev); ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); in_dev_put(ifa->ifa_dev); kfree(ifa); } static void inet_free_ifa(struct in_ifaddr *ifa) { /* Our reference to ifa->ifa_dev must be freed ASAP * to release the reference to the netdev the same way. * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() */ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) { struct in_device *idev = container_of(head, struct in_device, rcu_head); kfree(rcu_dereference_protected(idev->mc_hash, 1)); kfree(idev); } void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; int err = -ENOMEM; ASSERT_RTNL(); in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) netif_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); if (dev != blackhole_netdev) { err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); } /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; goto out; } static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; struct in_ifaddr *ifa; ASSERT_RTNL(); dev = in_dev->dev; in_dev->dead = 1; ip_mc_destroy_dev(in_dev); while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); in_dev_put(in_dev); } static int __init inet_blackhole_dev_init(void) { struct in_device *in_dev; rtnl_lock(); in_dev = inetdev_init(blackhole_netdev); rtnl_unlock(); return PTR_ERR_OR_ZERO(in_dev); } late_initcall(inet_blackhole_dev_init); int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); last_prim = ifap; if (in_dev->dead) goto no_promotions; /* 1. Deleting primary ifaddr forces deletion all secondaries * unless alias promotion is set **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; prev_prom = ifa; continue; } if (!do_promote) { inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } else { promote = ifa; break; } } } /* On promotion all secondaries from subnet are changing * the primary IP, we must remove all their routes silently * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); } no_promotions: /* 2. Unlink it */ *ifap = ifa1->ifa_next; inet_hash_remove(ifa1); /* 3. Announce address deletion */ /* Send message first, then call notifier. At first sight, FIB update triggered by notifier will refer to already deleted ifaddr, that could confuse netlink listeners. It is not true: look, gated sees that route deleted and if it still thinks that ifaddr is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { struct in_ifaddr *next_sec; next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { struct in_ifaddr *last_sec; rcu_assign_pointer(prev_prom->ifa_next, next_sec); last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; fib_add_ifaddr(ifa); } } if (destroy) inet_free_ifa(ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; ifap = &in_dev->ifa_list; ifa1 = rtnl_dereference(*ifap); while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { inet_free_ifa(ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; } ifap = &ifa1->ifa_next; ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh * in now, before changes are committed. The rntl lock is serializing * access here, so the state should not change between a validator call * and a final notify on commit. This isn't invoked on promotion under * the assumption that validators are checking the address itself, and * not the flags. */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); if (ret) { inet_free_ifa(ifa); return ret; } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) ifap = last_primary; rcu_assign_pointer(ifa->ifa_next, *ifap); rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); return 0; } static int inet_insert_ifa(struct in_ifaddr *ifa) { if (!ifa->ifa_local) { inet_free_ifa(ifa); return 0; } return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } /* Caller must hold RCU or RTNL : * We dont take a reference on found in_device */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. */ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { struct in_ifaddr *ifa; ASSERT_RTNL(); in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; } return NULL; } static int ip_mc_autojoin_config(struct net *net, bool join, const struct in_ifaddr *ifa) { #if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL_NET(net); lock_sock(sk); if (join) ret = ip_mc_join_group(sk, &mreq); else ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; #else return -EOPNOTSUPP; #endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) goto out; ifm = nlmsg_data(nlh); rtnl_net_lock(net); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto unlock; } for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) continue; if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; unlock: rtnl_net_unlock(net); out: return err; } static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; struct net *net; int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; u32 flags; flags = READ_ONCE(ifa->ifa_flags); if (flags & IFA_F_PERMANENT) continue; preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); valid_lft = READ_ONCE(ifa->ifa_valid_lft); tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (valid_lft != INFINITY_LIFE_TIME && age >= valid_lft) { change_needed = true; } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; } else if (age >= preferred_lft) { if (time_before(tstamp + valid_lft * HZ, next)) next = tstamp + valid_lft * HZ; if (!(flags & IFA_F_DEPRECATED)) change_needed = true; } else if (time_before(tstamp + preferred_lft * HZ, next)) { next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); if (!change_needed) continue; rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) continue; /* We try to batch several events at once. */ age = (now - ifa->ifa_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { struct in_ifaddr __rcu **ifap; struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } ifap = &tmp->ifa_next; tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_preferred_lft && !(ifa->ifa_flags & IFA_F_DEPRECATED)) { ifa->ifa_flags |= IFA_F_DEPRECATED; rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); next_sched = next; /* If rounded timeout is accurate enough, accept it. */ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) next_sched = next_sec; now = jiffies; /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; u32 flags; flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) WRITE_ONCE(ifa->ifa_valid_lft, timeout); else flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) flags |= IFA_F_DEPRECATED; WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } WRITE_ONCE(ifa->ifa_flags, flags); WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack, __u32 *valid_lft, __u32 *prefered_lft) { struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); return -EINVAL; } if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); return -EINVAL; } *valid_lft = ci->ifa_valid; *prefered_lft = ci->ifa_prefered; } return 0; } static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrmsg *ifm = nlmsg_data(nlh); struct in_device *in_dev; struct net_device *dev; struct in_ifaddr *ifa; int err; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; } in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroy with it. */ goto errout; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); return ifa; errout: return ERR_PTR(err); } static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } return NULL; } static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { __u32 prefered_lft = INFINITY_LIFE_TIME; __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa_existing; struct nlattr *tb[IFA_MAX + 1]; struct in_ifaddr *ifa; int ret; ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; rtnl_net_lock(net); ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) { ret = PTR_ERR(ifa); goto unlock; } ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); goto unlock; } } ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); ret = -EEXIST; goto unlock; } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { fib_modify_prefix_metric(ifa, new_metric); ifa->ifa_rt_priority = new_metric; } ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } unlock: rtnl_net_unlock(net); return ret; } /* * Determine a default network mask, based on the IP address. */ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; else if (IN_CLASSE(haddr)) rc = 32; } return rc; } int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; int ret = -EFAULT; int tryaddrmatch = 0; ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ /* Note that these ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) */ tryaddrmatch = (sin_orig.sin_family == AF_INET); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; break; case SIOCSIFFLAGS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) goto out; break; default: ret = -EINVAL; goto out; } rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; if (colon) *colon = ':'; in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ } } } /* we didn't get a match, maybe the application is 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { case SIOCGIFADDR: /* Get interface address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; break; case SIOCGIFBRDADDR: /* Get the broadcast address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; break; case SIOCGIFDSTADDR: /* Get the destination address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; break; case SIOCSIFFLAGS: if (colon) { ret = -EADDRNOTAVAIL; if (!ifa) break; ret = 0; if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; if (!ifa) { ret = -ENOBUFS; if (!in_dev) break; ifa = inet_alloc_ifa(in_dev); if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; if (!(dev->flags & IFF_POINTOPOINT)) { ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags & IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address | ~ifa->ifa_mask; } else { ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ ret = 0; if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = sin->sin_addr.s_addr; inet_insert_ifa(ifa); } break; case SIOCSIFDSTADDR: /* Set the destination address */ ret = 0; if (ifa->ifa_address == sin->sin_addr.s_addr) break; ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; ret = 0; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_address = sin->sin_addr.s_addr; inet_insert_ifa(ifa); break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ /* * The mask we set must be legal. */ ret = -EINVAL; if (bad_mask(sin->sin_addr.s_addr, 0)) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { __be32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); /* See if current broadcast address matches * with current netmask, then recalculate * the broadcast address. Otherwise it's a * funny address, so don't touch it since * the user seems to know what (s)he's doing... */ if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } inet_insert_ifa(ifa); } break; } done: rtnl_net_unlock(net); out: return ret; } int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; if (WARN_ON(size > sizeof(struct ifreq))) goto out; if (!in_dev) goto out; in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; } if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } len -= size; done += size; } out: return done; } static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { const struct in_ifaddr *ifa; __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net; int master_idx; rcu_read_lock(); net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } if (addr) goto out_unlock; no_in_dev: master_idx = l3mdev_master_ifindex_rcu(dev); /* For VRFs, the VRF device takes the place of the loopback device, * with addresses on it being preferred. Note in such cases the * loopback device will be among the devices that fail the master_idx * equality check in the loop below. */ if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { if (l3mdev_master_ifindex_rcu(dev) != master_idx) continue; in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } out_unlock: rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { unsigned char localnet_scope = RT_SCOPE_HOST; const struct in_ifaddr *ifa; __be32 addr = 0; int same = 0; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); if (!addr && (local == ifa->ifa_local || !local) && min_scope <= scope) { addr = ifa->ifa_local; if (same) break; } if (!same) { same = (!local || inet_ifa_match(local, ifa)) && (!dst || inet_ifa_match(dst, ifa)); if (same && addr) { if (local || !dst) break; /* Is the selected addr into dst subnet? */ if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ if (min_scope <= scope) { addr = ifa->ifa_local; break; } /* search for large dst subnet for addr */ same = 0; } } } return same ? addr : 0; } /* * Confirm that local IP address exists using wildcards: * - net: netns to check, cannot be NULL * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; if (in_dev) return confirm_addr_indev(in_dev, dst, local, scope); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_confirm_addr); /* * Device notifier */ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_notifier); int register_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(register_inetaddr_validator_notifier); int unregister_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_validator_notifier); /* Rename ifa_labels for a device name change. Make some effort to preserve * existing alias numbering and to create unique labels if possible. */ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { struct in_ifaddr *ifa; int named = 0; in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) goto skip; dot = strchr(old, ':'); if (!dot) { sprintf(old, ":%d", named); dot = old; } if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, dev->dev_addr, NULL); } } /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); if (IS_ERR(in_dev)) return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } else if (event == NETDEV_CHANGEMTU) { /* Re-enabling IP */ if (inetdev_valid_mtu(dev->mtu)) in_dev = inetdev_init(dev); } goto out; } switch (event) { case NETDEV_REGISTER: pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } ip_mc_up(in_dev); fallthrough; case NETDEV_CHANGEADDR: if (!IN_DEV_ARP_NOTIFY(in_dev)) break; fallthrough; case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); break; case NETDEV_PRE_TYPE_CHANGE: ip_mc_unmap(in_dev); break; case NETDEV_POST_TYPE_CHANGE: ip_mc_remap(in_dev); break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ fallthrough; case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: /* Do not notify about label change, this event is * not interesting to applications using netlink. */ inetdev_changename(dev, in_dev); devinet_sysctl_unregister(in_dev); devinet_sysctl_register(in_dev); break; } out: return NOTIFY_DONE; } static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, unsigned long tstamp, u32 preferred, u32 valid) { struct ifa_cacheinfo ci; ci.cstamp = cstamp_delta(cstamp); ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; unsigned long tstamp; u32 preferred, valid; u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; flags = READ_ONCE(ifa->ifa_flags); /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. * The 32bit value is given in IFA_FLAGS attribute. */ ifm->ifa_flags = (__u8)flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (args->netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; tstamp = READ_ONCE(ifa->ifa_tstamp); if (!(flags & IFA_F_PERMANENT)) { preferred = READ_ONCE(ifa->ifa_preferred_lft); valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; else preferred = 0; if (valid != INFINITY_LIFE_TIME) { if (valid > tval) valid -= tval; else valid = 0; } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; ifm = nlmsg_payload(nlh, sizeof(*ifm)); if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; } fillargs->ifindex = ifm->ifa_index; if (fillargs->ifindex) { cb->answer_flags |= NLM_F_DUMP_FILTERED; fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= IFA_MAX; ++i) { if (!tb[i]) continue; if (i == IFA_TARGET_NETNSID) { struct net *net; fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } *tgt_net = net; } else { NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct ip_mc_list *im; int ip_idx = 0; int err; for (im = rcu_dereference(in_dev->mc_list); im; im = rcu_dereference(im->next_rcu)) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { switch (fillargs->event) { case RTM_NEWADDR: return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs); case RTM_GETMULTICAST: return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx, fillargs); default: return -EINVAL; } } /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. */ if (!res) res = 0x80000000; return res; } static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, int event) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, .event = event, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct { unsigned long ifindex; int ip_idx; } *ctx = (void *)cb->ctx; struct in_device *in_dev; struct net_device *dev; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) goto done; if (fillargs.ifindex) { dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; goto done; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); goto done; } } cb->seq = inet_base_seq(tgt_net); for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); if (err < 0) goto done; } done: if (fillargs.netnsid >= 0) put_net(tgt_net); rcu_read_unlock(); return err; } static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_NEWADDR); } static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) { return inet_dump_addr(skb, cb, RTM_GETMULTICAST); } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { struct inet_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .event = event, .flags = 0, .netnsid = -1, }; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; if (!in_dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { [IFLA_INET_CONF] = { .type = NLA_NESTED }, }; static int inet_validate_link_af(const struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, inet_af_policy, extack); if (err < 0) return err; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { int cfgid = nla_type(a); if (nla_len(a) < 4) return -EINVAL; if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) return -EINVAL; } } return 0; } static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; if (!in_dev) return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); } return 0; } static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_BC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, const struct ipv4_devconf *devconf, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; if (!devconf) goto out; if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_RO(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; const struct ipv4_devconf *devconf; struct in_device *in_dev = NULL; struct net_device *dev = NULL; struct sk_buff *skb; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) return err; if (!tb[NETCONFA_IFINDEX]) return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: devconf = net->ipv4.devconf_all; break; case NETCONFA_IFINDEX_DEFAULT: devconf = net->ipv4.devconf_dflt; break; default: err = -ENODEV; dev = dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; break; } err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: if (in_dev) in_dev_put(in_dev); dev_put(dev); return err; } static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; unsigned int all_default; } *ctx = (void *)cb->ctx; const struct in_device *in_dev; struct net_device *dev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; } if (ctx->all_default == 0) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } if (ctx->all_default == 1) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } done: rcu_read_unlock(); return err; } #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; } rcu_read_unlock(); } /* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; int on = IPV4_DEVCONF_ALL(net, FORWARDING); IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) dev_disable_lro(dev); in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } } static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) { if (cnf == net->ipv4.devconf_dflt) return NETCONFA_IFINDEX_DEFAULT; else if (cnf == net->ipv4.devconf_all) return NETCONFA_IFINDEX_ALL; else { struct in_device *idev = container_of(cnf, struct in_device, cnf); return idev->dev->ifindex; } } static int devinet_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && new_value != old_value) rt_cache_flush(net); if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } return ret; } static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; struct net *net = ctl->extra2; int ret; if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; return restart_syscall(); } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else { struct ipv4_devconf *cnf = ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); } rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } return ret; } static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) rt_cache_flush(net); return ret; } #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ .procname = name, \ .data = ipv4_devconf.data + \ IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, "drop_unicast_in_l2_multicast"), }, }; static int __devinet_sysctl_register(struct net *net, char *dev_name, int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) goto free; p->sysctl = t; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, ifindex, p); return 0; free: kfree(t); out: return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; if (t) { cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) { int err; if (!sysctl_dev_name_is_allowed(idev->dev->name)) return -EINVAL; err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; } static void devinet_sysctl_unregister(struct in_device *idev) { struct net *net = dev_net(idev->dev); __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } static struct ctl_table ctl_forward_entry[] = { { .procname = "ip_forward", .data = &ipv4_devconf.data[ IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, }; #endif static __net_init int devinet_init_net(struct net *net) { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table *tbl; #endif struct ipv4_devconf *all, *dflt; int err; int i; err = -ENOMEM; net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif if (!net_eq(net, &init_net)) { switch (net_inherit_devconf()) { case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 0: case 1: /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 2: /* use compiled values */ break; } } #ifdef CONFIG_SYSCTL err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; err = __devinet_sysctl_register(net, "default", NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; err = -ENOMEM; forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; #ifdef CONFIG_SYSCTL err_reg_ctl: __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(tbl); err_alloc_ctl: #endif kfree(dflt); err_alloc_dflt: kfree(all); err_alloc_all: kfree(net->ipv4.inet_addr_lst); err_alloc_hash: return err; } static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; #endif cancel_delayed_work_sync(&net->ipv4.addr_chk_work); #ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); __devinet_sysctl_unregister(net, net->ipv4.devconf_all, NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { .family = AF_INET, .fill_link_af = inet_fill_link_af, .get_link_af_size = inet_get_link_af_size, .validate_link_af = inet_validate_link_af, .set_link_af = inet_set_link_af, }; static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST, .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) { register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); if (rtnl_af_register(&inet_af_ops)) panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); }
72 1 72 57 6 1 63 64 64 5 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 * Phillip Lougher <phillip@squashfs.org.uk> * * decompressor.c */ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" #include "page_actor.h" /* * This file (and decompressor.h) implements a decompressor framework for * Squashfs, allowing multiple decompressors to be easily supported */ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; #ifndef CONFIG_SQUASHFS_LZ4 static const struct squashfs_decompressor squashfs_lz4_comp_ops = { NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0 }; #endif #ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif #ifndef CONFIG_SQUASHFS_XZ static const struct squashfs_decompressor squashfs_xz_comp_ops = { NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZLIB static const struct squashfs_decompressor squashfs_zlib_comp_ops = { NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZSTD static const struct squashfs_decompressor squashfs_zstd_comp_ops = { NULL, NULL, NULL, NULL, ZSTD_COMPRESSION, "zstd", 0 }; #endif static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, &squashfs_lz4_comp_ops, &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, &squashfs_lzma_unsupported_comp_ops, &squashfs_zstd_comp_ops, &squashfs_unknown_comp_ops }; const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) { int i; for (i = 0; decompressor[i]->id; i++) if (id == decompressor[i]->id) break; return decompressor[i]; } static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; void *buffer = NULL, *comp_opts; struct squashfs_page_actor *actor = NULL; int length = 0; /* * Read decompressor specific options from file system if present */ if (SQUASHFS_COMP_OPTS(flags)) { buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buffer == NULL) { comp_opts = ERR_PTR(-ENOMEM); goto out; } actor = squashfs_page_actor_init(&buffer, 1, 0); if (actor == NULL) { comp_opts = ERR_PTR(-ENOMEM); goto out; } length = squashfs_read_data(sb, sizeof(struct squashfs_super_block), 0, NULL, actor); if (length < 0) { comp_opts = ERR_PTR(length); goto out; } } comp_opts = squashfs_comp_opts(msblk, buffer, length); out: kfree(actor); kfree(buffer); return comp_opts; } void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; void *stream, *comp_opts = get_comp_opts(sb, flags); if (IS_ERR(comp_opts)) return comp_opts; stream = msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); return stream; }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 // SPDX-License-Identifier: GPL-2.0-or-later /* * xfrm_proc.c * * Copyright (C)2006-2007 USAGI/WIDE Project * * Authors: Masahide NAKAMURA <nakam@linux-ipv6.org> */ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #include <net/snmp.h> #include <net/xfrm.h> static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmInError", LINUX_MIB_XFRMINERROR), SNMP_MIB_ITEM("XfrmInBufferError", LINUX_MIB_XFRMINBUFFERERROR), SNMP_MIB_ITEM("XfrmInHdrError", LINUX_MIB_XFRMINHDRERROR), SNMP_MIB_ITEM("XfrmInNoStates", LINUX_MIB_XFRMINNOSTATES), SNMP_MIB_ITEM("XfrmInStateProtoError", LINUX_MIB_XFRMINSTATEPROTOERROR), SNMP_MIB_ITEM("XfrmInStateModeError", LINUX_MIB_XFRMINSTATEMODEERROR), SNMP_MIB_ITEM("XfrmInStateSeqError", LINUX_MIB_XFRMINSTATESEQERROR), SNMP_MIB_ITEM("XfrmInStateExpired", LINUX_MIB_XFRMINSTATEEXPIRED), SNMP_MIB_ITEM("XfrmInStateMismatch", LINUX_MIB_XFRMINSTATEMISMATCH), SNMP_MIB_ITEM("XfrmInStateInvalid", LINUX_MIB_XFRMINSTATEINVALID), SNMP_MIB_ITEM("XfrmInTmplMismatch", LINUX_MIB_XFRMINTMPLMISMATCH), SNMP_MIB_ITEM("XfrmInNoPols", LINUX_MIB_XFRMINNOPOLS), SNMP_MIB_ITEM("XfrmInPolBlock", LINUX_MIB_XFRMINPOLBLOCK), SNMP_MIB_ITEM("XfrmInPolError", LINUX_MIB_XFRMINPOLERROR), SNMP_MIB_ITEM("XfrmOutError", LINUX_MIB_XFRMOUTERROR), SNMP_MIB_ITEM("XfrmOutBundleGenError", LINUX_MIB_XFRMOUTBUNDLEGENERROR), SNMP_MIB_ITEM("XfrmOutBundleCheckError", LINUX_MIB_XFRMOUTBUNDLECHECKERROR), SNMP_MIB_ITEM("XfrmOutNoStates", LINUX_MIB_XFRMOUTNOSTATES), SNMP_MIB_ITEM("XfrmOutStateProtoError", LINUX_MIB_XFRMOUTSTATEPROTOERROR), SNMP_MIB_ITEM("XfrmOutStateModeError", LINUX_MIB_XFRMOUTSTATEMODEERROR), SNMP_MIB_ITEM("XfrmOutStateSeqError", LINUX_MIB_XFRMOUTSTATESEQERROR), SNMP_MIB_ITEM("XfrmOutStateExpired", LINUX_MIB_XFRMOUTSTATEEXPIRED), SNMP_MIB_ITEM("XfrmOutPolBlock", LINUX_MIB_XFRMOUTPOLBLOCK), SNMP_MIB_ITEM("XfrmOutPolDead", LINUX_MIB_XFRMOUTPOLDEAD), SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR), SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR), SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID), SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), }; static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) { unsigned long buff[ARRAY_SIZE(xfrm_mib_list)]; const int cnt = ARRAY_SIZE(xfrm_mib_list); struct net *net = seq->private; int i; memset(buff, 0, sizeof(buff)); xfrm_state_update_stats(net); snmp_get_cpu_field_batch_cnt(buff, xfrm_mib_list, cnt, net->mib.xfrm_statistics); for (i = 0; i < cnt; i++) seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, buff[i]); return 0; } int __net_init xfrm_proc_init(struct net *net) { if (!proc_create_net_single("xfrm_stat", 0444, net->proc_net, xfrm_statistics_seq_show, NULL)) return -ENOMEM; return 0; } void xfrm_proc_fini(struct net *net) { remove_proc_entry("xfrm_stat", net->proc_net); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_H #define _IPV6_H #include <uapi/linux/ipv6.h> #include <linux/cache.h> #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) #define ipv6_authlen(p) (((p)->hdrlen+2) << 2) /* * This structure contains configuration options per IPv6 link. */ struct ipv6_devconf { /* RX & TX fastpath fields. */ __cacheline_group_begin(ipv6_devconf_read_txrx); __s32 disable_ipv6; __s32 hop_limit; __s32 mtu6; __s32 forwarding; __s32 force_forwarding; __s32 disable_policy; __s32 proxy_ndp; __cacheline_group_end(ipv6_devconf_read_txrx); __s32 accept_ra; __s32 accept_redirects; __s32 autoconf; __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; __s32 mldv2_unsolicited_report_interval; __s32 use_tempaddr; __s32 temp_valid_lft; __s32 temp_prefered_lft; __s32 regen_min_advance; __s32 regen_max_retry; __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; __s32 accept_ra_min_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif __s32 accept_source_route; __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD __s32 optimistic_dad; __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE atomic_t mc_forwarding; #endif __s32 drop_unicast_in_l2_multicast; __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; __s32 accept_untracked_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; __s32 keep_addr_on_down; __s32 seg6_enabled; #ifdef CONFIG_IPV6_SEG6_HMAC __s32 seg6_require_hmac; #endif __u32 enhanced_dad; __u32 addr_gen_mode; __s32 ndisc_tclass; __s32 rpl_seg_enabled; __u32 ioam6_id; __u32 ioam6_id_wide; __u8 ioam6_enabled; __u8 ndisc_evict_nocarrier; __u8 ra_honor_pio_life; __u8 ra_honor_pio_pflag; struct ctl_table_header *sysctl_header; }; struct ipv6_params { __s32 disable_ipv6; __s32 autoconf; }; extern struct ipv6_params ipv6_defaults; #include <linux/tcp.h> #include <linux/udp.h> #include <net/inet_sock.h> static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_network_header(skb); } static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_inner_network_header(skb); } static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_transport_header(skb); } static inline unsigned int ipv6_transport_len(const struct sk_buff *skb) { return ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr) - skb_network_header_len(skb); } /* This structure contains results of exthdrs parsing as offsets from skb->nh. */ struct inet6_skb_parm { int iif; __be16 ra; __u16 dst0; __u16 srcrt; __u16 dst1; __u16 lastopt; __u16 nhoff; __u16 flags; #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) __u16 dsthao; #endif __u16 frag_max_size; __u16 srhoff; #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 #define IP6SKB_FAKEJUMBO 512 #define IP6SKB_MULTIPATH 1024 #define IP6SKB_MCROUTE 2048 }; #if defined(CONFIG_NET_L3_MASTER_DEV) static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } #endif #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) #define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb)) static inline int inet6_iif(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } static inline bool inet6_is_jumbogram(const struct sk_buff *skb) { return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); } /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline int inet6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) return IP6CB(skb)->iif; #endif return 0; } struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; u8 dontfrag:1; }; /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ struct in6_addr saddr; __be32 flow_label; u32 dst_cookie; struct ipv6_txoptions __rcu *opt; s16 hop_limit; u8 pmtudisc; u8 tclass; #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif bool daddr_cache; u8 mcast_hops; u32 frag_size; int ucast_oif; int mcast_oif; /* pktoption flags */ union { struct { u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, rxhlim:1, rxohlim:1, hopopts:1, ohopopts:1, dstopts:1, odstopts:1, rxflow:1, rxtclass:1, rxpmtu:1, rxorigdstaddr:1, recvfragsize:1; /* 1 bits hole */ } bits; u16 all; } rxopt; /* sockopt flags */ u8 srcprefs; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ u8 min_hopcount; __be32 rcv_flowinfo; struct in6_pktinfo sticky_pktinfo; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct inet6_cork cork; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; }; /* We currently use available bits from inet_sk(sk)->inet_flags, * this could change in the future. */ #define inet6_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_set_bit(nr, sk) \ set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_clear_bit(nr, sk) \ clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */ struct raw6_sock { /* inet_sock has to be the first member of raw6_sock */ struct inet_sock inet; __u32 checksum; /* perform checksum */ __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; struct numa_drop_counters drop_counters; struct ipv6_pinfo inet6; }; struct udp6_sock { struct udp_sock udp; struct ipv6_pinfo inet6; }; struct tcp6_sock { struct tcp_sock tcp; struct ipv6_pinfo inet6; }; extern int inet6_sk_rebuild_header(struct sock *sk); struct tcp6_timewait_sock { struct tcp_timewait_sock tcp6tw_tcp; }; #if IS_ENABLED(CONFIG_IPV6) bool ipv6_mod_enabled(void); static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL; } #define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk) #define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ inet6_sk(sk)->rxopt.bits.rxinfo) static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { if (sk->sk_family == AF_INET6) return &sk->sk_v6_rcv_saddr; return NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) { /* ipv6only field is at same position for timewait and other sockets */ return ipv6_only_sock(sk); } #else #define ipv6_only_sock(sk) 0 #define ipv6_sk_rxinfo(sk) 0 static inline bool ipv6_mod_enabled(void) { return false; } static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { return NULL; } static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } #define inet6_rcv_saddr(__sk) NULL #define inet_v6_ipv6only(__sk) 0 #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _IPV6_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2024 Intel Corporation */ #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) #define __MAC80211_DRIVER_TRACE #include <linux/tracepoint.h> #include <net/mac80211.h> #include "ieee80211_i.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM mac80211 #define MAXNAME 32 #define LOCAL_ENTRY __array(char, wiphy_name, 32) #define LOCAL_ASSIGN strscpy(__entry->wiphy_name, wiphy_name(local->hw.wiphy), MAXNAME) #define LOCAL_PR_FMT "%s" #define LOCAL_PR_ARG __entry->wiphy_name #define STA_ENTRY __array(char, sta_addr, ETH_ALEN) #define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : \ eth_zero_addr(__entry->sta_addr)) #define STA_NAMED_ASSIGN(s) memcpy(__entry->sta_addr, (s)->addr, ETH_ALEN) #define STA_PR_FMT " sta:%pM" #define STA_PR_ARG __entry->sta_addr #define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ __field(bool, p2p) \ __string(vif_name, sdata->name) #define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ __entry->p2p = sdata->vif.p2p; \ __assign_str(vif_name) #define VIF_PR_FMT " vif:%s(%d%s)" #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" #define CHANDEF_ENTRY __field(u32, control_freq) \ __field(u32, freq_offset) \ __field(u32, chan_width) \ __field(u32, center_freq1) \ __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHANDEF_ASSIGN(c) \ __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \ __entry->freq_offset = (c) ? ((c)->chan ? (c)->chan->freq_offset : 0) : 0; \ __entry->chan_width = (c) ? (c)->width : 0; \ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ __entry->freq1_offset = (c) ? (c)->freq1_offset : 0; \ __entry->center_freq2 = (c) ? (c)->center_freq2 : 0; #define CHANDEF_PR_FMT " chandef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define CHANDEF_PR_ARG __entry->control_freq, __entry->freq_offset, __entry->chan_width, \ __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2 #define MIN_CHANDEF_ENTRY \ __field(u32, min_control_freq) \ __field(u32, min_freq_offset) \ __field(u32, min_chan_width) \ __field(u32, min_center_freq1) \ __field(u32, min_freq1_offset) \ __field(u32, min_center_freq2) #define MIN_CHANDEF_ASSIGN(c) \ __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0; \ __entry->min_chan_width = (c)->width; \ __entry->min_center_freq1 = (c)->center_freq1; \ __entry->min_freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; #define MIN_CHANDEF_PR_FMT " mindef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ __entry->min_chan_width, \ __entry->min_center_freq1, __entry->min_freq1_offset, \ __entry->min_center_freq2 #define AP_CHANDEF_ENTRY \ __field(u32, ap_control_freq) \ __field(u32, ap_freq_offset) \ __field(u32, ap_chan_width) \ __field(u32, ap_center_freq1) \ __field(u32, ap_freq1_offset) \ __field(u32, ap_center_freq2) #define AP_CHANDEF_ASSIGN(c) \ __entry->ap_control_freq = (c)->chan ? (c)->chan->center_freq : 0;\ __entry->ap_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0;\ __entry->ap_chan_width = (c)->chan ? (c)->width : 0; \ __entry->ap_center_freq1 = (c)->chan ? (c)->center_freq1 : 0; \ __entry->ap_freq1_offset = (c)->chan ? (c)->freq1_offset : 0; \ __entry->ap_center_freq2 = (c)->chan ? (c)->center_freq2 : 0; #define AP_CHANDEF_PR_FMT " ap(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define AP_CHANDEF_PR_ARG __entry->ap_control_freq, __entry->ap_freq_offset, \ __entry->ap_chan_width, \ __entry->ap_center_freq1, __entry->ap_freq1_offset, \ __entry->ap_center_freq2 #define CHANCTX_ENTRY CHANDEF_ENTRY \ MIN_CHANDEF_ENTRY \ AP_CHANDEF_ENTRY \ __field(u8, rx_chains_static) \ __field(u8, rx_chains_dynamic) #define CHANCTX_ASSIGN CHANDEF_ASSIGN(&ctx->conf.def) \ MIN_CHANDEF_ASSIGN(&ctx->conf.min_def) \ AP_CHANDEF_ASSIGN(&ctx->conf.ap) \ __entry->rx_chains_static = ctx->conf.rx_chains_static; \ __entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic #define CHANCTX_PR_FMT CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT AP_CHANDEF_PR_FMT " chains:%d/%d" #define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, AP_CHANDEF_PR_ARG, \ __entry->rx_chains_static, __entry->rx_chains_dynamic #define KEY_ENTRY __field(u32, cipher) \ __field(u8, hw_key_idx) \ __field(u8, flags) \ __field(s8, keyidx) #define KEY_ASSIGN(k) __entry->cipher = (k)->cipher; \ __entry->flags = (k)->flags; \ __entry->keyidx = (k)->keyidx; \ __entry->hw_key_idx = (k)->hw_key_idx; #define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d" #define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx #define AMPDU_ACTION_ENTRY __field(enum ieee80211_ampdu_mlme_action, \ ieee80211_ampdu_mlme_action) \ STA_ENTRY \ __field(u16, tid) \ __field(u16, ssn) \ __field(u16, buf_size) \ __field(bool, amsdu) \ __field(u16, timeout) \ __field(u16, action) #define AMPDU_ACTION_ASSIGN STA_NAMED_ASSIGN(params->sta); \ __entry->tid = params->tid; \ __entry->ssn = params->ssn; \ __entry->buf_size = params->buf_size; \ __entry->amsdu = params->amsdu; \ __entry->timeout = params->timeout; \ __entry->action = params->action; #define AMPDU_ACTION_PR_FMT STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d action %d" #define AMPDU_ACTION_PR_ARG STA_PR_ARG, __entry->tid, __entry->ssn, \ __entry->buf_size, __entry->amsdu, __entry->timeout, \ __entry->action /* * Tracing for driver callbacks. */ DECLARE_EVENT_CLASS(local_only_evt, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local), TP_STRUCT__entry( LOCAL_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; ), TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) ); DECLARE_EVENT_CLASS(local_sdata_addr_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __array(char, addr, ETH_ALEN) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; memcpy(__entry->addr, sdata->vif.addr, ETH_ALEN); ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " addr:%pM", LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr ) ); DECLARE_EVENT_CLASS(local_u32_evt, TP_PROTO(struct ieee80211_local *local, u32 value), TP_ARGS(local, value), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, value) ), TP_fast_assign( LOCAL_ASSIGN; __entry->value = value; ), TP_printk( LOCAL_PR_FMT " value:%d", LOCAL_PR_ARG, __entry->value ) ); DECLARE_EVENT_CLASS(local_sdata_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG ) ); DEFINE_EVENT(local_only_evt, drv_return_void, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(drv_return_int, TP_PROTO(struct ieee80211_local *local, int ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(int, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret) ); TRACE_EVENT(drv_return_bool, TP_PROTO(struct ieee80211_local *local, bool ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %s", LOCAL_PR_ARG, (__entry->ret) ? "true" : "false") ); TRACE_EVENT(drv_return_u32, TP_PROTO(struct ieee80211_local *local, u32 ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(u32, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %u", LOCAL_PR_ARG, __entry->ret) ); TRACE_EVENT(drv_return_u64, TP_PROTO(struct ieee80211_local *local, u64 ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(u64, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret) ); DEFINE_EVENT(local_only_evt, drv_start, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_u32_evt, drv_get_et_strings, TP_PROTO(struct ieee80211_local *local, u32 sset), TP_ARGS(local, sset) ); DEFINE_EVENT(local_u32_evt, drv_get_et_sset_count, TP_PROTO(struct ieee80211_local *local, u32 sset), TP_ARGS(local, sset) ); DEFINE_EVENT(local_only_evt, drv_get_et_stats, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_only_evt, drv_suspend, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_only_evt, drv_resume, TP_PROTO(struct ieee80211_local *local), TP_ARGS(local) ); TRACE_EVENT(drv_set_wakeup, TP_PROTO(struct ieee80211_local *local, bool enabled), TP_ARGS(local, enabled), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, enabled) ), TP_fast_assign( LOCAL_ASSIGN; __entry->enabled = enabled; ), TP_printk(LOCAL_PR_FMT " enabled:%d", LOCAL_PR_ARG, __entry->enabled) ); TRACE_EVENT(drv_stop, TP_PROTO(struct ieee80211_local *local, bool suspend), TP_ARGS(local, suspend), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, suspend) ), TP_fast_assign( LOCAL_ASSIGN; __entry->suspend = suspend; ), TP_printk(LOCAL_PR_FMT " suspend:%d", LOCAL_PR_ARG, __entry->suspend) ); DEFINE_EVENT(local_sdata_addr_evt, drv_add_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_change_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p), TP_ARGS(local, sdata, type, p2p), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, new_type) __field(bool, new_p2p) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->new_type = type; __entry->new_p2p = p2p; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " new type:%d%s", LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type, __entry->new_p2p ? "/p2p" : "" ) ); DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), TP_ARGS(local, sdata) ); TRACE_EVENT(drv_config, TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 changed), TP_ARGS(local, radio_idx, changed), TP_STRUCT__entry( LOCAL_ENTRY __field(int, radio_idx) __field(u32, changed) __field(u32, flags) __field(int, power_level) __field(int, dynamic_ps_timeout) __field(u16, listen_interval) __field(u8, long_frame_max_tx_count) __field(u8, short_frame_max_tx_count) CHANDEF_ENTRY __field(int, smps) ), TP_fast_assign( LOCAL_ASSIGN; __entry->radio_idx = radio_idx; __entry->changed = changed; __entry->flags = local->hw.conf.flags; __entry->power_level = local->hw.conf.power_level; __entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout; __entry->listen_interval = local->hw.conf.listen_interval; __entry->long_frame_max_tx_count = local->hw.conf.long_frame_max_tx_count; __entry->short_frame_max_tx_count = local->hw.conf.short_frame_max_tx_count; CHANDEF_ASSIGN(&local->hw.conf.chandef) __entry->smps = local->hw.conf.smps_mode; ), TP_printk( LOCAL_PR_FMT " radio_idx:%d ch:%#x" CHANDEF_PR_FMT, LOCAL_PR_ARG, __entry->radio_idx, __entry->changed, CHANDEF_PR_ARG ) ); TRACE_EVENT(drv_vif_cfg_changed, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed), TP_ARGS(local, sdata, changed), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u64, changed) __field(bool, assoc) __field(bool, ibss_joined) __field(bool, ibss_creator) __field(u16, aid) __dynamic_array(u32, arp_addr_list, sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : sdata->vif.cfg.arp_addr_cnt) __field(int, arp_addr_cnt) __dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len) __field(int, s1g) __field(bool, idle) __field(bool, ps) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->changed = changed; __entry->aid = sdata->vif.cfg.aid; __entry->assoc = sdata->vif.cfg.assoc; __entry->ibss_joined = sdata->vif.cfg.ibss_joined; __entry->ibss_creator = sdata->vif.cfg.ibss_creator; __entry->ps = sdata->vif.cfg.ps; __entry->arp_addr_cnt = sdata->vif.cfg.arp_addr_cnt; memcpy(__get_dynamic_array(arp_addr_list), sdata->vif.cfg.arp_addr_list, sizeof(u32) * (sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : sdata->vif.cfg.arp_addr_cnt)); memcpy(__get_dynamic_array(ssid), sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len); __entry->s1g = sdata->vif.cfg.s1g; __entry->idle = sdata->vif.cfg.idle; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " changed:%#llx", LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed ) ); TRACE_EVENT(drv_link_info_changed, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, u64 changed), TP_ARGS(local, sdata, link_conf, changed), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u64, changed) __field(int, link_id) __field(bool, cts) __field(bool, shortpre) __field(bool, shortslot) __field(bool, enable_beacon) __field(u8, dtimper) __field(u16, bcnint) __field(u16, assoc_cap) __field(u64, sync_tsf) __field(u32, sync_device_ts) __field(u8, sync_dtim_count) __field(u32, basic_rates) __array(int, mcast_rate, NUM_NL80211_BANDS) __field(u16, ht_operation_mode) __field(s32, cqm_rssi_thold) __field(s32, cqm_rssi_hyst) __field(u32, channel_width) __field(u32, channel_cfreq1) __field(u32, channel_cfreq1_offset) __field(bool, qos) __field(bool, hidden_ssid) __field(int, txpower) __field(u8, p2p_oppps_ctwindow) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->changed = changed; __entry->link_id = link_conf->link_id; __entry->shortpre = link_conf->use_short_preamble; __entry->cts = link_conf->use_cts_prot; __entry->shortslot = link_conf->use_short_slot; __entry->enable_beacon = link_conf->enable_beacon; __entry->dtimper = link_conf->dtim_period; __entry->bcnint = link_conf->beacon_int; __entry->assoc_cap = link_conf->assoc_capability; __entry->sync_tsf = link_conf->sync_tsf; __entry->sync_device_ts = link_conf->sync_device_ts; __entry->sync_dtim_count = link_conf->sync_dtim_count; __entry->basic_rates = link_conf->basic_rates; memcpy(__entry->mcast_rate, link_conf->mcast_rate, sizeof(__entry->mcast_rate)); __entry->ht_operation_mode = link_conf->ht_operation_mode; __entry->cqm_rssi_thold = link_conf->cqm_rssi_thold; __entry->cqm_rssi_hyst = link_conf->cqm_rssi_hyst; __entry->channel_width = link_conf->chanreq.oper.width; __entry->channel_cfreq1 = link_conf->chanreq.oper.center_freq1; __entry->channel_cfreq1_offset = link_conf->chanreq.oper.freq1_offset; __entry->qos = link_conf->qos; __entry->hidden_ssid = link_conf->hidden_ssid; __entry->txpower = link_conf->txpower; __entry->p2p_oppps_ctwindow = link_conf->p2p_noa_attr.oppps_ctwindow; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " link_id:%d, changed:%#llx", LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->changed ) ); TRACE_EVENT(drv_prepare_multicast, TP_PROTO(struct ieee80211_local *local, int mc_count), TP_ARGS(local, mc_count), TP_STRUCT__entry( LOCAL_ENTRY __field(int, mc_count) ), TP_fast_assign( LOCAL_ASSIGN; __entry->mc_count = mc_count; ), TP_printk( LOCAL_PR_FMT " prepare mc (%d)", LOCAL_PR_ARG, __entry->mc_count ) ); TRACE_EVENT(drv_configure_filter, TP_PROTO(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, u64 multicast), TP_ARGS(local, changed_flags, total_flags, multicast), TP_STRUCT__entry( LOCAL_ENTRY __field(unsigned int, changed) __field(unsigned int, total) __field(u64, multicast) ), TP_fast_assign( LOCAL_ASSIGN; __entry->changed = changed_flags; __entry->total = *total_flags; __entry->multicast = multicast; ), TP_printk( LOCAL_PR_FMT " changed:%#x total:%#x", LOCAL_PR_ARG, __entry->changed, __entry->total ) ); TRACE_EVENT(drv_config_iface_filter, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int filter_flags, unsigned int changed_flags), TP_ARGS(local, sdata, filter_flags, changed_flags), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(unsigned int, filter_flags) __field(unsigned int, changed_flags) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->filter_flags = filter_flags; __entry->changed_flags = changed_flags; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT " filter_flags: %#x changed_flags: %#x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->filter_flags, __entry->changed_flags ) ); TRACE_EVENT(drv_set_tim, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set), TP_ARGS(local, sta, set), TP_STRUCT__entry( LOCAL_ENTRY STA_ENTRY __field(bool, set) ), TP_fast_assign( LOCAL_ASSIGN; STA_ASSIGN; __entry->set = set; ), TP_printk( LOCAL_PR_FMT STA_PR_FMT " set:%d", LOCAL_PR_ARG, STA_PR_ARG, __entry->set ) ); TRACE_EVENT(drv_set_key, TP_PROTO(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key), TP_ARGS(local, cmd, sdata, sta, key), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, cmd) KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; __entry->cmd = cmd; KEY_ASSIGN(key); ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " cmd: %d" KEY_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd, KEY_PR_ARG ) ); TRACE_EVENT(drv_update_tkip_key, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_key_conf *conf, struct ieee80211_sta *sta, u32 iv32), TP_ARGS(local, sdata, conf, sta, iv32), TP_