Total coverage: 283740 (18%)of 1606251
60 4 4 64 63 64 1 56 13 88 84 14 64 64 64 15 15 15 65 64 4 3 14 14 58 60 56 9 18 51 59 57 8 60 7 56 58 8 10 10 10 63 5 5 63 59 1 64 63 60 4 4 64 63 99 98 63 61 64 64 40 64 97 41 62 2 85 12 12 10 27 20 7 6 1 5 1 5 1 1 11 11 2 1 2 5 5 5 12 12 12 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 60 60 59 1 10 10 10 85 85 85 85 45 16 96 78 2 18 97 78 78 2 18 17 17 99 59 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 // SPDX-License-Identifier: GPL-2.0 /* * bcachefs journalling code, for btree insertions * * Copyright 2012 Google, Inc. */ #include "bcachefs.h" #include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" #include "enumerated_ref.h" #include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" #include "trace.h" static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; } static bool __journal_entry_is_open(union journal_res_state state) { return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; } static inline unsigned nr_unwritten_journal_entries(struct journal *j) { return atomic64_read(&j->seq) - j->seq_ondisk; } static bool journal_entry_is_open(struct journal *j) { return __journal_entry_is_open(j->reservations); } static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) { union journal_res_state s = READ_ONCE(j->reservations); unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); if (!buf->write_started) prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); struct closure *cl = &buf->io; int r = atomic_read(&cl->remaining); prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); if (buf->data) { prt_printf(out, "size:\t"); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); } prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); prt_printf(out, "flags:\t"); if (buf->noflush) prt_str(out, "noflush "); if (buf->must_flush) prt_str(out, "must_flush "); if (buf->separate_flush) prt_str(out, "separate_flush "); if (buf->need_flush_to_write_buffer) prt_str(out, "need_flush_to_write_buffer "); if (buf->write_started) prt_str(out, "write_started "); if (buf->write_allocated) prt_str(out, "write_allocated "); if (buf->write_done) prt_str(out, "write_done"); prt_newline(out); printbuf_indent_sub(out, 2); } static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) { lockdep_assert_held(&j->lock); out->atomic++; if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); for (u64 seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); seq++) bch2_journal_buf_to_text(out, j, seq); prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); --out->atomic; } static inline struct journal_buf * journal_seq_to_buf(struct journal *j, u64 seq) { struct journal_buf *buf = NULL; EBUG_ON(seq > journal_cur_seq(j)); if (journal_seq_unwritten(j, seq)) buf = j->buf + (seq & JOURNAL_BUF_MASK); return buf; } static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) INIT_LIST_HEAD(&p->unflushed[i]); for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) INIT_LIST_HEAD(&p->flushed[i]); atomic_set(&p->count, count); p->devs.nr = 0; } /* * Detect stuck journal conditions and trigger shutdown. Technically the journal * can end up stuck for a variety of reasons, such as a blocked I/O, journal * reservation lockup, etc. Since this is a fatal error with potentially * unpredictable characteristics, we want to be fairly conservative before we * decide to shut things down. * * Consider the journal stuck when it appears full with no ability to commit * btree transactions, to discard journal buckets, nor acquire priority * (reserved watermark) reservation. */ static inline bool journal_error_check_stuck(struct journal *j, int error, unsigned flags) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool stuck = false; struct printbuf buf = PRINTBUF; buf.atomic++; if (!(error == -BCH_ERR_journal_full || error == -BCH_ERR_journal_pin_full) || nr_unwritten_journal_entries(j) || (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) return stuck; spin_lock(&j->lock); if (j->can_discard) { spin_unlock(&j->lock); return stuck; } stuck = true; /* * The journal shutdown path will set ->err_seq, but do it here first to * serialize against concurrent failures and avoid duplicate error * reports. */ if (j->err_seq) { spin_unlock(&j->lock); return stuck; } j->err_seq = journal_cur_seq(j); __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), bch2_err_str(error)); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_reset(&buf); bch2_journal_pins_to_text(&buf, j); bch_err(c, "Journal pins:\n%s", buf.buf); printbuf_exit(&buf); bch2_fatal_error(c); dump_stack(); return stuck; } void bch2_journal_do_writes(struct journal *j) { for (u64 seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); seq++) { unsigned idx = seq & JOURNAL_BUF_MASK; struct journal_buf *w = j->buf + idx; if (w->write_started && !w->write_allocated) break; if (w->write_started) continue; if (!journal_state_seq_count(j, j->reservations, seq)) { j->seq_write_started = seq; w->write_started = true; closure_call(&w->io, bch2_journal_write, j->wq, NULL); } break; } } /* * Final processing when the last reference of a journal buffer has been * dropped. Drop the pin list reference acquired at journal entry open and write * the buffer, if requested. */ void bch2_journal_buf_put_final(struct journal *j, u64 seq) { lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); bch2_journal_do_writes(j); /* * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an * open journal entry */ wake_up(&j->wait); } /* * Returns true if journal entry is now closed: * * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; unsigned sectors; BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && closed_val != JOURNAL_ENTRY_ERROR_VAL); lockdep_assert_held(&j->lock); old.v = atomic64_read(&j->reservations.counter); do { new.v = old.v; new.cur_entry_offset = closed_val; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || old.cur_entry_offset == new.cur_entry_offset) return; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); if (!__journal_entry_is_open(old)) return; if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) old.cur_entry_offset = j->cur_entry_offset_if_blocked; /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); if (trace_journal_entry_close_enabled() && trace) { struct printbuf pbuf = PRINTBUF; pbuf.atomic++; prt_str(&pbuf, "entry size: "); prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); prt_newline(&pbuf); bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); trace_journal_entry_close(c, pbuf.buf); printbuf_exit(&pbuf); } sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; if (unlikely(sectors > buf->sectors)) { struct printbuf err = PRINTBUF; err.atomic++; prt_printf(&err, "journal entry overran reserved space: %u > %u\n", sectors, buf->sectors); prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", le32_to_cpu(buf->data->u64s), buf->u64s_reserved, j->cur_entry_u64s, c->block_bits); prt_printf(&err, "fatal error - emergency read only"); bch2_journal_halt_locked(j); bch_err(c, "%s", err.buf); printbuf_exit(&err); return; } buf->sectors = sectors; /* * We have to set last_seq here, _before_ opening a new journal entry: * * A threads may replace an old pin with a new pin on their current * journal reservation - the expectation being that the journal will * contain either what the old pin protected or what the new pin * protects. * * After the old pin is dropped journal_last_seq() won't include the old * pin, so we can only write the updated last_seq on the entry that * contains whatever the new pin protects. * * Restated, we can _not_ update last_seq for a given entry if there * could be a newer entry open with reservations/pins that have been * taken against it. * * Hence, we want update/set last_seq on the current journal entry right * before we open a new one: */ buf->last_seq = journal_last_seq(j); buf->data->last_seq = cpu_to_le64(buf->last_seq); BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); cancel_delayed_work(&j->write_work); bch2_journal_space_available(j); __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt_locked(struct journal *j) { lockdep_assert_held(&j->lock); __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); if (!j->err_seq) j->err_seq = journal_cur_seq(j); journal_wake(j); } void bch2_journal_halt(struct journal *j) { spin_lock(&j->lock); bch2_journal_halt_locked(j); spin_unlock(&j->lock); } static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || journal_cur_seq(j) == journal_last_unwritten_seq(j); /* Don't close it yet if we already have a write in flight: */ if (ret) __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); if (!buf->flush_time) { buf->flush_time = local_clock() ?: 1; buf->expires = jiffies; } } return ret; } bool bch2_journal_entry_close(struct journal *j) { bool ret; spin_lock(&j->lock); ret = journal_entry_want_write(j); spin_unlock(&j->lock); return ret; } /* * should _only_ called from journal_res_get() - when we actually want a * journal reservation - journal entry is open means journal is dirty: */ static int journal_entry_open(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = j->buf + ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); union journal_res_state old, new; int u64s; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) return bch_err_throw(c, journal_blocked); if (j->cur_entry_error) return j->cur_entry_error; int ret = bch2_journal_error(j); if (unlikely(ret)) return ret; if (!fifo_free(&j->pin)) return bch_err_throw(c, journal_pin_full); if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return bch_err_throw(c, journal_max_in_flight); if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) return bch_err_throw(c, journal_max_open); if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); return bch_err_throw(c, journal_shutdown); } if (!j->free_buf && !buf->data) return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ BUG_ON(!j->cur_entry_sectors); if (!buf->data) { swap(buf->data, j->free_buf); swap(buf->buf_size, j->free_buf_size); } buf->expires = (journal_cur_seq(j) == j->flushed_seq_ondisk ? jiffies : j->last_flush_write) + msecs_to_jiffies(c->opts.journal_flush_delay); buf->u64s_reserved = j->entry_u64s_reserved; buf->disk_sectors = j->cur_entry_sectors; buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); u64s = (int) (buf->sectors << 9) / sizeof(u64) - journal_entry_overhead(j); u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) return bch_err_throw(c, journal_full); if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); /* * The fifo_push() needs to happen at the same time as j->seq is * incremented for journal_last_seq() to be calculated correctly */ atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { bch_err(c, "attempting to open blacklisted journal seq %llu", journal_cur_seq(j)); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); return bch_err_throw(c, journal_shutdown); } BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); buf->noflush = false; buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; buf->need_flush_to_write_buffer = true; buf->write_started = false; buf->write_allocated = false; buf->write_done = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); buf->data->u64s = 0; if (j->early_journal_entries.nr) { memcpy(buf->data->_data, j->early_journal_entries.data, j->early_journal_entries.nr * sizeof(u64)); le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); } /* * Must be set before marking the journal entry as open: */ j->cur_entry_u64s = u64s; old.v = atomic64_read(&j->reservations.counter); do { new.v = old.v; BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); new.idx++; BUG_ON(journal_state_count(new, new.idx)); BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); journal_state_inc(&new); /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); if (nr_unwritten_journal_entries(j) == 1) mod_delayed_work(j->wq, &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); if (j->early_journal_entries.nr) darray_exit(&j->early_journal_entries); return 0; } static bool journal_quiesced(struct journal *j) { bool ret = atomic64_read(&j->seq) == j->seq_ondisk; if (!ret) bch2_journal_entry_close(j); return ret; } static void journal_quiesce(struct journal *j) { wait_event(j->wait, journal_quiesced(j)); } static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); spin_lock(&j->lock); if (__journal_entry_is_open(j->reservations)) { long delta = journal_cur_buf(j)->expires - jiffies; if (delta > 0) mod_delayed_work(j->wq, &j->write_work, delta); else __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } spin_unlock(&j->lock); } static void journal_buf_prealloc(struct journal *j) { if (j->free_buf && j->free_buf_size >= j->buf_size_want) return; unsigned buf_size = j->buf_size_want; spin_unlock(&j->lock); void *buf = kvmalloc(buf_size, GFP_NOFS); spin_lock(&j->lock); if (buf && (!j->free_buf || buf_size > j->free_buf_size)) { swap(buf, j->free_buf); swap(buf_size, j->free_buf_size); } if (unlikely(buf)) { spin_unlock(&j->lock); /* kvfree can sleep */ kvfree(buf); spin_lock(&j->lock); } } static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; bool can_discard; int ret; retry: if (journal_res_get_fast(j, res, flags)) return 0; ret = bch2_journal_error(j); if (unlikely(ret)) return ret; if (j->blocked) return bch_err_throw(c, journal_blocked); if ((flags & BCH_WATERMARK_MASK) < j->watermark) { ret = bch_err_throw(c, journal_full); can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { ret = bch_err_throw(c, journal_max_in_flight); goto out; } spin_lock(&j->lock); journal_buf_prealloc(j); /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { ret = 0; goto unlock; } /* * If we couldn't get a reservation because the current buf filled up, * and we had room for a bigger entry on disk, signal that we want to * realloc the journal bufs: */ buf = journal_cur_buf(j); if (journal_entry_is_open(j) && buf->buf_size >> 9 < buf->disk_sectors && buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); out: if (likely(!ret)) return 0; if (ret == -BCH_ERR_journal_retry_open) goto retry; if (journal_error_check_stuck(j, ret, flags)) ret = bch_err_throw(c, journal_stuck); if (ret == -BCH_ERR_journal_max_in_flight && track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && trace_journal_entry_full_enabled()) { struct printbuf buf = PRINTBUF; bch2_printbuf_make_room(&buf, 4096); spin_lock(&j->lock); prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); bch2_journal_bufs_to_text(&buf, j); spin_unlock(&j->lock); trace_journal_entry_full(c, buf.buf); printbuf_exit(&buf); count_event(c, journal_entry_full); } if (ret == -BCH_ERR_journal_max_open && track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && trace_journal_entry_full_enabled()) { struct printbuf buf = PRINTBUF; bch2_printbuf_make_room(&buf, 4096); spin_lock(&j->lock); prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); bch2_journal_bufs_to_text(&buf, j); spin_unlock(&j->lock); trace_journal_entry_full(c, buf.buf); printbuf_exit(&buf); count_event(c, journal_entry_full); } /* * Journal is full - can't rely on reclaim from work item due to * freezing: */ if ((ret == -BCH_ERR_journal_full || ret == -BCH_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); goto retry; } if (mutex_trylock(&j->reclaim_lock)) { bch2_journal_reclaim(j); mutex_unlock(&j->reclaim_lock); } } return ret; } static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; guard(rcu)(); for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); return nsecs_to_jiffies(nsecs); } /* * Essentially the entry function to the journaling code. When bcachefs is doing * a btree insert, it calls this function to get the current journal write. * Journal write is the structure used set up journal writes. The calling * function will then add its keys to the structure, queuing them for the next * write. * * To ensure forward progress, the current task must not be holding any * btree node write locks. */ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, unsigned flags, struct btree_trans *trans) { int ret; if (closure_wait_event_timeout(&j->async_wait, !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), HZ)) return ret; if (trans) bch2_trans_unlock_long(trans); struct bch_fs *c = container_of(j, struct bch_fs, journal); int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); remaining_wait = max(0, remaining_wait - HZ); if (closure_wait_event_timeout(&j->async_wait, !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), remaining_wait)) return ret; struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); bch2_print_str(c, KERN_ERR, buf.buf); prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); printbuf_exit(&buf); closure_wait_event(&j->async_wait, !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, struct journal_entry_res *res, unsigned new_u64s) { union journal_res_state state; int d = new_u64s - res->u64s; spin_lock(&j->lock); j->entry_u64s_reserved += d; if (d <= 0) goto out; j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); state = READ_ONCE(j->reservations); if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && state.cur_entry_offset > j->cur_entry_u64s) { j->cur_entry_u64s += d; /* * Not enough room in current journal entry, have to flush it: */ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } else { journal_cur_buf(j)->u64s_reserved += d; } out: spin_unlock(&j->lock); res->u64s += d; } /* journal flushing: */ /** * bch2_journal_flush_seq_async - wait for a journal entry to be written * @j: journal object * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, * -BCH_ERR_journal_flush_err if @seq will never be flushed * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; int ret = 0; if (seq <= j->flushed_seq_ondisk) return 1; spin_lock(&j->lock); if (WARN_ONCE(seq > journal_cur_seq(j), "requested to flush journal seq %llu, but currently at %llu", seq, journal_cur_seq(j))) goto out; /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { ret = bch_err_throw(c, journal_flush_err); goto out; } if (seq <= j->flushed_seq_ondisk) { ret = 1; goto out; } /* if seq was written, but not flushed - flush a newer one instead */ seq = max(seq, journal_last_unwritten_seq(j)); recheck_need_open: if (seq > journal_cur_seq(j)) { struct journal_res res = { 0 }; if (journal_entry_is_open(j)) __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); spin_unlock(&j->lock); /* * We're called from bch2_journal_flush_seq() -> wait_event(); * but this might block. We won't usually block, so we won't * livelock: */ sched_annotate_sleep(); ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; seq = res.seq; buf = journal_seq_to_buf(j, seq); buf->must_flush = true; if (!buf->flush_time) { buf->flush_time = local_clock() ?: 1; buf->expires = jiffies; } if (parent && !closure_wait(&buf->wait, parent)) BUG(); bch2_journal_res_put(j, &res); spin_lock(&j->lock); goto want_write; } /* * if write was kicked off without a flush, or if we promised it * wouldn't be a flush, flush the next sequence number instead */ buf = journal_seq_to_buf(j, seq); if (buf->noflush) { seq++; goto recheck_need_open; } buf->must_flush = true; j->flushing_seq = max(j->flushing_seq, seq); if (parent && !closure_wait(&buf->wait, parent)) BUG(); want_write: if (seq == journal_cur_seq(j)) journal_entry_want_write(j); out: spin_unlock(&j->lock); return ret; } int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state) { u64 start_time = local_clock(); int ret, ret2; /* * Don't update time_stats when @seq is already flushed: */ if (seq <= j->flushed_seq_ondisk) return 0; ret = wait_event_state(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)), task_state); if (!ret) bch2_time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? ret2 : 0; } /* * bch2_journal_flush_async - if there is an open journal entry, or a journal * still being written, write it and wait for the write to complete */ void bch2_journal_flush_async(struct journal *j, struct closure *parent) { bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); } int bch2_journal_flush(struct journal *j) { return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE); } /* * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the * range [start, end) * @seq */ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; bool ret = false; if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; if (c->journal.flushed_seq_ondisk >= start) return false; spin_lock(&j->lock); if (c->journal.flushed_seq_ondisk >= start) goto out; for (unwritten_seq = journal_last_unwritten_seq(j); unwritten_seq < end; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); /* journal flush already in flight, or flush requseted */ if (buf->must_flush) goto out; buf->noflush = true; } ret = true; out: spin_unlock(&j->lock); return ret; } static int __bch2_journal_meta(struct journal *j) { struct journal_res res = {}; int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); if (ret) return ret; struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); buf->must_flush = true; if (!buf->flush_time) { buf->flush_time = local_clock() ?: 1; buf->expires = jiffies; } bch2_journal_res_put(j, &res); return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); } int bch2_journal_meta(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) return bch_err_throw(c, erofs_no_writes); int ret = __bch2_journal_meta(j); enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); return ret; } /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) { spin_lock(&j->lock); if (!--j->blocked && j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { union journal_res_state old, new; old.v = atomic64_read(&j->reservations.counter); do { new.v = old.v; new.cur_entry_offset = j->cur_entry_offset_if_blocked; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); } spin_unlock(&j->lock); journal_wake(j); } static void __bch2_journal_block(struct journal *j) { if (!j->blocked++) { union journal_res_state old, new; old.v = atomic64_read(&j->reservations.counter); do { j->cur_entry_offset_if_blocked = old.cur_entry_offset; if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) break; new.v = old.v; new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } void bch2_journal_block(struct journal *j) { spin_lock(&j->lock); __bch2_journal_block(j); spin_unlock(&j->lock); journal_quiesce(j); } static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq, bool *blocked) { struct journal_buf *ret = NULL; /* We're inside wait_event(), but using mutex_lock(: */ sched_annotate_sleep(); mutex_lock(&j->buf_lock); spin_lock(&j->lock); max_seq = min(max_seq, journal_cur_seq(j)); for (u64 seq = journal_last_unwritten_seq(j); seq <= max_seq; seq++) { unsigned idx = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); if (open && !*blocked) { __bch2_journal_block(j); *blocked = true; } ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open ? ERR_PTR(-EAGAIN) : buf; break; } } spin_unlock(&j->lock); if (IS_ERR_OR_NULL(ret)) mutex_unlock(&j->buf_lock); return ret; } struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq, bool *blocked) { struct journal_buf *ret; *blocked = false; wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq, blocked)) != ERR_PTR(-EAGAIN)); if (IS_ERR_OR_NULL(ret) && *blocked) bch2_journal_unblock(j); return ret; } /* allocate journal on a device: */ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, bool new_fs, struct closure *cl) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; u64 *new_bucket_seq = NULL, *new_buckets = NULL; struct open_bucket **ob = NULL; long *bu = NULL; unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; int ret = 0; BUG_ON(nr <= ja->nr); bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err_free; } for (nr_got = 0; nr_got < nr_want; nr_got++) { enum bch_watermark watermark = new_fs ? BCH_WATERMARK_btree : BCH_WATERMARK_normal; ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, BCH_DATA_journal, cl); ret = PTR_ERR_OR_ZERO(ob[nr_got]); if (ret) break; if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, ca->mi.bucket_size, BTREE_TRIGGER_transactional)); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); bch_err_msg(c, ret, "marking new journal buckets"); break; } } bu[nr_got] = ob[nr_got]->bucket; } if (!nr_got) goto err_free; /* Don't return an error if we successfully allocated some buckets: */ ret = 0; if (c) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_block(&c->journal); mutex_lock(&c->sb_lock); } memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); BUG_ON(ja->discard_idx > ja->nr); pos = ja->discard_idx ?: ja->nr; memmove(new_buckets + pos + nr_got, new_buckets + pos, sizeof(new_buckets[0]) * (ja->nr - pos)); memmove(new_bucket_seq + pos + nr_got, new_bucket_seq + pos, sizeof(new_bucket_seq[0]) * (ja->nr - pos)); for (i = 0; i < nr_got; i++) { new_buckets[pos + i] = bu[i]; new_bucket_seq[pos + i] = 0; } nr = ja->nr + nr_got; ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); if (ret) goto err_unblock; bch2_write_super(c); /* Commit: */ if (c) spin_lock(&c->journal.lock); swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); ja->nr = nr; if (pos <= ja->discard_idx) ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; if (pos <= ja->dirty_idx_ondisk) ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; if (pos <= ja->dirty_idx) ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; if (c) spin_unlock(&c->journal.lock); err_unblock: if (c) { bch2_journal_unblock(&c->journal); mutex_unlock(&c->sb_lock); } if (ret && !new_fs) for (i = 0; i < nr_got; i++) bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, bu[i], BCH_DATA_free, 0, BTREE_TRIGGER_transactional)); err_free: for (i = 0; i < nr_got; i++) bch2_open_bucket_put(c, ob[i]); kfree(new_bucket_seq); kfree(new_buckets); kfree(ob); kfree(bu); return ret; } static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, unsigned nr, bool new_fs) { struct journal_device *ja = &ca->journal; int ret = 0; struct closure cl; closure_init_stack(&cl); /* don't handle reducing nr of buckets yet: */ if (nr < ja->nr) return 0; while (!ret && ja->nr < nr) { struct disk_reservation disk_res = { 0, 0, 0 }; /* * note: journal buckets aren't really counted as _sectors_ used yet, so * we don't need the disk reservation to avoid the BUG_ON() in buckets.c * when space used goes up without a reservation - but we do need the * reservation to ensure we'll actually be able to allocate: * * XXX: that's not right, disk reservations only ensure a * filesystem-wide allocation will succeed, this is a device * specific allocation - we can hang here: */ if (!new_fs) { ret = bch2_disk_reservation_get(c, &disk_res, bucket_to_sector(ca, nr - ja->nr), 1, 0); if (ret) break; } ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); if (ret == -BCH_ERR_bucket_alloc_blocked || ret == -BCH_ERR_open_buckets_empty) ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); closure_sync(&cl); } return ret; } /* * Allocate more journal space at runtime - not currently making use if it, but * the code works: */ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, unsigned nr) { down_write(&c->state_lock); int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); up_write(&c->state_lock); bch_err_fn(c, ret); return ret; } int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) { struct bch_fs *c = ca->fs; struct journal *j = &c->journal; struct journal_device *ja = &ca->journal; guard(mutex)(&c->sb_lock); unsigned pos; for (pos = 0; pos < ja->nr; pos++) if (ja->buckets[pos] == b) break; if (pos == ja->nr) { bch_err(ca, "journal bucket %llu not found when deleting", b); return -EINVAL; } u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);; if (!new_buckets) return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memmove(&new_buckets[pos], &new_buckets[pos + 1], (ja->nr - 1 - pos) * sizeof(new_buckets[0])); int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: bch2_write_super(c); if (ret) { kfree(new_buckets); return ret; } scoped_guard(spinlock, &j->lock) { if (pos < ja->discard_idx) --ja->discard_idx; if (pos < ja->dirty_idx_ondisk) --ja->dirty_idx_ondisk; if (pos < ja->dirty_idx) --ja->dirty_idx; if (pos < ja->cur_idx) --ja->cur_idx; ja->nr--; memmove(&ja->buckets[pos], &ja->buckets[pos + 1], (ja->nr - pos) * sizeof(ja->buckets[0])); memmove(&ja->bucket_seq[pos], &ja->bucket_seq[pos + 1], (ja->nr - pos) * sizeof(ja->bucket_seq[0])); bch2_journal_space_available(j); } kfree(new_buckets); return 0; } int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { struct bch_fs *c = ca->fs; if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) return 0; if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); return bch_err_throw(c, erofs_filesystem_full); } unsigned nr; int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) { ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err; } /* 1/128th of the device by default: */ nr = ca->mi.nbuckets >> 7; /* * clamp journal size to 8192 buckets or 8GB (in sectors), whichever * is smaller: */ nr = clamp_t(unsigned, nr, BCH_JOURNAL_BUCKETS_MIN, min(1 << 13, (1 << 24) / ca->mi.bucket_size)); ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; } int bch2_fs_journal_alloc(struct bch_fs *c) { for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { if (ca->journal.nr) continue; int ret = bch2_dev_journal_alloc(ca, true); if (ret) { enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_fs_journal_alloc); return ret; } } return 0; } /* startup/shutdown: */ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { bool ret = false; u64 seq; spin_lock(&j->lock); for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j) && !ret; seq++) { struct journal_buf *buf = journal_seq_to_buf(j, seq); if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) ret = true; } spin_unlock(&j->lock); return ret; } void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) { wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); } void bch2_fs_journal_stop(struct journal *j) { if (!test_bit(JOURNAL_running, &j->flags)) return; bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); wait_event(j->wait, bch2_journal_entry_close(j)); /* * Always write a new journal entry, to make sure the clock hands are up * to date (and match the superblock) */ __bch2_journal_meta(j); journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); WARN(!bch2_journal_error(j) && test_bit(JOURNAL_replay_done, &j->flags) && j->last_empty_seq != journal_cur_seq(j), "journal shutdown error: cur seq %llu but last empty seq %llu", journal_cur_seq(j), j->last_empty_seq); if (!bch2_journal_error(j)) clear_bit(JOURNAL_running, &j->flags); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; u64 last_seq = cur_seq, nr, seq; /* * * XXX pick most recent non blacklisted sequence number */ cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); if (cur_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; } genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; if (journal_replay_ignore(i)) continue; last_seq = le64_to_cpu(i->j.last_seq); break; } nr = cur_seq - last_seq; /* * Extra fudge factor, in case we crashed when the journal pin fifo was * nearly or completely full. We'll need to be able to open additional * journal entries (at least a few) in order for journal replay to get * going: */ nr += nr / 4; nr = max(nr, JOURNAL_PIN); init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); if (!j->pin.data) { bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); return bch_err_throw(c, ENOMEM_journal_pin_fifo); } j->replay_journal_seq = last_seq; j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; j->seq_write_started = cur_seq - 1; j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); BUG_ON(seq >= cur_seq); if (seq < last_seq) continue; if (journal_entry_empty(&i->j)) j->last_empty_seq = le64_to_cpu(i->j.seq); p = journal_seq_pin(j, seq); p->devs.nr = 0; darray_for_each(i->ptrs, ptr) bch2_dev_list_add_dev(&p->devs, ptr->dev); had_entries = true; } if (!had_entries) j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); j->last_flush_write = jiffies; j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); spin_unlock(&j->lock); return 0; } void bch2_journal_set_replay_done(struct journal *j) { /* * journal_space_available must happen before setting JOURNAL_running * JOURNAL_running must happen before JOURNAL_replay_done */ spin_lock(&j->lock); bch2_journal_space_available(j); set_bit(JOURNAL_need_flush_write, &j->flags); set_bit(JOURNAL_running, &j->flags); set_bit(JOURNAL_replay_done, &j->flags); spin_unlock(&j->lock); } /* init/exit: */ void bch2_dev_journal_exit(struct bch_dev *ca) { struct journal_device *ja = &ca->journal; for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { kfree(ja->bio[i]); ja->bio[i] = NULL; } kfree(ja->buckets); kfree(ja->bucket_seq); ja->buckets = NULL; ja->bucket_seq = NULL; } int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = bch2_sb_field_get(sb, journal_v2); ja->nr = 0; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); for (unsigned i = 0; i < nr; i++) ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); } else if (journal_buckets) { ja->nr = bch2_nr_journal_buckets(journal_buckets); } ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) return bch_err_throw(c, ENOMEM_dev_journal_init); unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return bch_err_throw(c, ENOMEM_dev_journal_init); ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) return bch_err_throw(c, ENOMEM_dev_journal_init); if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); unsigned dst = 0; for (unsigned i = 0; i < nr; i++) for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ja->buckets[dst++] = le64_to_cpu(journal_buckets_v2->d[i].start) + j; } else if (journal_buckets) { for (unsigned i = 0; i < ja->nr; i++) ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); } return 0; } void bch2_fs_journal_exit(struct journal *j) { if (j->wq) destroy_workqueue(j->wq); darray_exit(&j->early_journal_entries); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) kvfree(j->buf[i].data); kvfree(j->free_buf); free_fifo(&j->pin); } void bch2_fs_journal_init_early(struct journal *j) { static struct lock_class_key res_key; mutex_init(&j->buf_lock); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); init_waitqueue_head(&j->reclaim_wait); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); } int bch2_fs_journal_init(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) return bch_err_throw(c, ENOMEM_journal_buf); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); if (!j->wq) return bch_err_throw(c, ENOMEM_fs_other_alloc); return 0; } /* debug: */ static const char * const bch2_journal_flags_strs[] = { #define x(n) #n, JOURNAL_FLAGS() #undef x NULL }; void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; unsigned long now = jiffies; u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 28); out->atomic++; guard(rcu)(); s = READ_ONCE(j->reservations); prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_journal_flags_strs, j->flags); prt_newline(out); prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); prt_printf(out, "average write size:\t"); prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); prt_newline(out); prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: prt_printf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: prt_printf(out, "closed\n"); break; case JOURNAL_ENTRY_BLOCKED_VAL: prt_printf(out, "blocked\n"); break; default: prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } prt_printf(out, "unwritten entries:\n"); bch2_journal_bufs_to_text(out, j); prt_printf(out, "space:\n"); printbuf_indent_add(out, 2); prt_printf(out, "discarded\t%u:%u\n", j->space[journal_space_discarded].next_entry, j->space[journal_space_discarded].total); prt_printf(out, "clean ondisk\t%u:%u\n", j->space[journal_space_clean_ondisk].next_entry, j->space[journal_space_clean_ondisk].total); prt_printf(out, "clean\t%u:%u\n", j->space[journal_space_clean].next_entry, j->space[journal_space_clean].total); prt_printf(out, "total\t%u:%u\n", j->space[journal_space_total].next_entry, j->space[journal_space_total].total); printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->mi.durability) continue; struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) continue; if (!ja->nr) continue; prt_printf(out, "dev %u:\n", ca->dev_idx); prt_printf(out, "durability %u:\n", ca->mi.durability); printbuf_indent_add(out, 2); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); printbuf_indent_sub(out, 2); } prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); --out->atomic; } void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { spin_lock(&j->lock); __bch2_journal_debug_to_text(out, j); spin_unlock(&j->lock); }
19396 27901 4 2 9713 5272 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/processor.h> #include <linux/context_tracking_irq.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void synchronize_rcu(void); struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); // Maximum number of unsigned long values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 /** * same_state_synchronize_rcu - Are two old-state values identical? * @oldstate1: First old-state value. * @oldstate2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or * get_completed_synchronize_rcu(). Returns @true if the two values are * identical and @false otherwise. This allows structures whose lifetimes * are tracked by old-state values to push these values to a list header, * allowing those structures to be slightly smaller. */ static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); preempt_enable(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_LAZY void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); #else static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } #endif /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static __always_inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); #define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #define RCU_NOCB_LOCKDEP_WARN(c, s) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU // Bits for ->trc_reader_special.b.need_qs field. #define TRC_NEED_QS 0x1 // Task needs a quiescent state. #define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); void rcu_tasks_trace_qs_blkd(struct task_struct *t); # define rcu_tasks_trace_qs(t) \ do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ likely(!___rttq_nesting)) { \ rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs(t); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? * * As an accident of implementation, an RCU Tasks Trace grace period also * acts as an RCU grace period. However, this could change at any time. * Code relying on this accident must call this function to verify that * this accident is still happening. * * You have been warned! */ static inline bool rcu_trace_implies_rcu_gp(void) { return true; } /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /** * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states * @old_ts: jiffies at start of processing. * * This helper is for long-running softirq handlers, such as NAPI threads in * networking. The caller should initialize the variable passed in as @old_ts * at the beginning of the softirq handler. When invoked frequently, this macro * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will * provide both RCU and RCU-Tasks quiescent states. Note that this macro * modifies its old_ts argument. * * Because regions of code that have disabled softirq act as RCU read-side * critical sections, this macro should be invoked with softirq (and * preemption) enabled. * * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would * have more chance to invoke schedule() calls and provide necessary quiescent * states. As a contrast, calling cond_resched() only won't achieve the same * effect because cond_resched() does not provide RCU-Tasks quiescent states. */ #define rcu_softirq_qs_periodic(old_ts) \ do { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ time_after(jiffies, (old_ts) + HZ / 10)) { \ preempt_disable(); \ rcu_softirq_qs(); \ preempt_enable(); \ (old_ts) = jiffies; \ } \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_try_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_try_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } static inline int debug_lockdep_rcu_enabled(void) { return 0; } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message * * This checks debug_lockdep_rcu_enabled() before checking (c) to * prevent early boot splats due to lockdep not yet being initialized, * and rechecks it after checking (c) to prevent false-positive splats * due to races with lockdep being disabled. See commit 3066820034b5dd * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data..unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else // #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { } #endif // #else // #ifndef CONFIG_PREEMPT_RCU #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) // See RCU_LOCKDEP_WARN() for an explanation of the double call to // debug_lockdep_rcu_enabled(). static inline bool lockdep_assert_rcu_helper(bool c) { return debug_lockdep_rcu_enabled() && (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && debug_lockdep_rcu_enabled(); } /** * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() * * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. */ #define lockdep_assert_in_rcu_read_lock() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map))) /** * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() * * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. * Note that local_bh_disable() and friends do not suffice here, instead an * actual rcu_read_lock_bh() is required. */ #define lockdep_assert_in_rcu_read_lock_bh() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map))) /** * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() * * Splats if lockdep is enabled and there is no rcu_read_lock_sched() * in effect. Note that preempt_disable() and friends do not suffice here, * instead an actual rcu_read_lock_sched() is required. */ #define lockdep_assert_in_rcu_read_lock_sched() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map))) /** * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader * * Splats if lockdep is enabled and there is no RCU reader of any * type in effect. Note that regions of code protected by things like * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify * as RCU readers. * * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY * kernels that are not also built with PREEMPT_COUNT. But if you have * lockdep enabled, you might as well also enable PREEMPT_COUNT. */ #define lockdep_assert_in_rcu_reader() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ !lock_is_held(&rcu_bh_lock_map) && \ !lock_is_held(&rcu_sched_lock_map) && \ preemptible())) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) #define lockdep_assert_in_rcu_read_lock() do { } while (0) #define lockdep_assert_in_rcu_read_lock_bh() do { } while (0) #define lockdep_assert_in_rcu_read_lock_sched() do { } while (0) #define lockdep_assert_in_rcu_reader() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __unrcu_pointer(p, local) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)(p); \ rcu_check_sparse(p, __rcu); \ ((typeof(*p) __force __kernel *)(local)); \ }) /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property * * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ #define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu)) #define __rcu_access_pointer(p, local, space) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_check(p, local, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_protected(p, local, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define __rcu_dereference_raw(p, local) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) local = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu)) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * Within an RCU read-side critical section, there is little reason to * use rcu_access_pointer(). * * It is usually best to test the rcu_access_pointer() return value * directly in order to avoid accidental dereferences being introduced * by later inattentive changes. In other words, assigning the * rcu_access_pointer() return value to a local variable results in an * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). However, * please note that starting in v5.0 kernels, vanilla RCU grace periods * wait for local_bh_disable() regions of code in addition to regions of * code demarked by rcu_read_lock() and rcu_read_unlock(). This means * that synchronize_rcu(), call_rcu, and friends all take not only * rcu_read_lock() but also rcu_read_lock_bh() into account. */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). * However, please note that starting in v5.0 kernels, vanilla RCU grace * periods wait for preempt_disable() regions of code in addition to * regions of code demarked by rcu_read_lock() and rcu_read_unlock(). * This means that synchronize_rcu(), call_rcu, and friends all take not * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_sched_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Both synchronize_rcu() and call_rcu() also wait for regions of code * with preemption disabled, including regions of code with interrupts or * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. * This deadlock immunity also extends to the scheduler's runqueue * and priority-inheritance spinlocks, courtesy of the quiescent-state * deferral that is carried out when rcu_read_unlock() is invoked with * interrupts disabled. * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ __release(RCU); __rcu_read_unlock(); } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent to rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as an RCU * read-side critical section. However, please note that this equivalence * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and * rcu_read_lock_bh() were unrelated. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent to rcu_read_lock(), but also disables preemption. * Read-side critical sections can also be introduced by anything else that * disables preemption, including local_irq_disable() and friends. However, * please note that the equivalence to rcu_read_lock() applies only to * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched() * were unrelated. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. In order to have a universal * callback function handling different offsets of rcu_head, the callback needs * to determine the starting address of the freed object, which can be a large * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * The object to be freed can be allocated either by kmalloc() or * kmem_cache_alloc(). * * Note that the allowable offset might decrease in the future. * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) #define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** * kfree_rcu_mightsleep() - kfree an object after a grace period. * @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) /* * In mm/slab_common.c, no suitable header to include here. */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); /* * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the * comment of kfree_rcu() for details. */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) { \ BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (void *) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; DEFINE_LOCK_GUARD_0(rcu, do { rcu_read_lock(); /* * sparse doesn't call the cleanup function, * so just release immediately and don't track * the context. We don't need to anyway, since * the whole point of the guard is to not need * the explicit unlock. */ __release(RCU); } while (0), rcu_read_unlock()) #endif /* __LINUX_RCUPDATE_H */
1 9 9 9 29 12 2 32 190 188 178 9 197 180 192 186 186 188 186 186 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> #include "devl_internal.h" EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); static struct devlink *devlinks_xa_get(unsigned long index) { struct devlink *devlink; rcu_read_lock(); devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED); if (!devlink || !devlink_try_get(devlink)) devlink = NULL; rcu_read_unlock(); return devlink; } /* devlink_rels xarray contains 1:1 relationships between * devlink object and related nested devlink instance. * The xarray index is used to get the nested object from * the nested-in object code. */ static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1); #define DEVLINK_REL_IN_USE XA_MARK_0 struct devlink_rel { u32 index; refcount_t refcount; u32 devlink_index; struct { u32 devlink_index; u32 obj_index; devlink_rel_notify_cb_t *notify_cb; devlink_rel_cleanup_cb_t *cleanup_cb; struct delayed_work notify_work; } nested_in; }; static void devlink_rel_free(struct devlink_rel *rel) { xa_erase(&devlink_rels, rel->index); kfree(rel); } static void __devlink_rel_get(struct devlink_rel *rel) { refcount_inc(&rel->refcount); } static void __devlink_rel_put(struct devlink_rel *rel) { if (refcount_dec_and_test(&rel->refcount)) devlink_rel_free(rel); } static void devlink_rel_nested_in_notify_work(struct work_struct *work) { struct devlink_rel *rel = container_of(work, struct devlink_rel, nested_in.notify_work.work); struct devlink *devlink; devlink = devlinks_xa_get(rel->nested_in.devlink_index); if (!devlink) goto rel_put; if (!devl_trylock(devlink)) { devlink_put(devlink); goto reschedule_work; } if (!devl_is_registered(devlink)) { devl_unlock(devlink); devlink_put(devlink); goto rel_put; } if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE)) rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index); rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index); devl_unlock(devlink); devlink_put(devlink); rel_put: __devlink_rel_put(rel); return; reschedule_work: schedule_delayed_work(&rel->nested_in.notify_work, 1); } static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel) { __devlink_rel_get(rel); schedule_delayed_work(&rel->nested_in.notify_work, 0); } static struct devlink_rel *devlink_rel_alloc(void) { struct devlink_rel *rel; static u32 next; int err; rel = kzalloc(sizeof(*rel), GFP_KERNEL); if (!rel) return ERR_PTR(-ENOMEM); err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); if (err < 0) { kfree(rel); return ERR_PTR(err); } refcount_set(&rel->refcount, 1); INIT_DELAYED_WORK(&rel->nested_in.notify_work, &devlink_rel_nested_in_notify_work); return rel; } static void devlink_rel_put(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink_rel_nested_in_notify_work_schedule(rel); __devlink_rel_put(rel); devlink->rel = NULL; } void devlink_rel_nested_in_clear(u32 rel_index) { xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE); } int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index, u32 obj_index, devlink_rel_notify_cb_t *notify_cb, devlink_rel_cleanup_cb_t *cleanup_cb, struct devlink *devlink) { struct devlink_rel *rel = devlink_rel_alloc(); ASSERT_DEVLINK_NOT_REGISTERED(devlink); if (IS_ERR(rel)) return PTR_ERR(rel); rel->devlink_index = devlink->index; rel->nested_in.devlink_index = devlink_index; rel->nested_in.obj_index = obj_index; rel->nested_in.notify_cb = notify_cb; rel->nested_in.cleanup_cb = cleanup_cb; *rel_index = rel->index; xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink->rel = rel; return 0; } /** * devlink_rel_nested_in_notify - Notify the object this devlink * instance is nested in. * @devlink: devlink * * This is called upon network namespace change of devlink instance. * In case this devlink instance is nested in another devlink object, * a notification of a change of this object should be sent * over netlink. The parent devlink instance lock needs to be * taken during the notification preparation. * However, since the devlink lock of nested instance is held here, * we would end with wrong devlink instance lock ordering and * deadlock. Therefore the work is utilized to avoid that. */ void devlink_rel_nested_in_notify(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; devlink_rel_nested_in_notify_work_schedule(rel); } static struct devlink_rel *devlink_rel_find(unsigned long rel_index) { return xa_find(&devlink_rels, &rel_index, rel_index, DEVLINK_REL_IN_USE); } static struct devlink *devlink_rel_devlink_get(u32 rel_index) { struct devlink_rel *rel; u32 devlink_index; if (!rel_index) return NULL; xa_lock(&devlink_rels); rel = devlink_rel_find(rel_index); if (rel) devlink_index = rel->devlink_index; xa_unlock(&devlink_rels); if (!rel) return NULL; return devlinks_xa_get(devlink_index); } int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink, u32 rel_index, int attrtype, bool *msg_updated) { struct net *net = devlink_net(devlink); struct devlink *rel_devlink; int err; rel_devlink = devlink_rel_devlink_get(rel_index); if (!rel_devlink) return 0; err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype); devlink_put(rel_devlink); if (!err && msg_updated) *msg_updated = true; return err; } void *devlink_priv(struct devlink *devlink) { return &devlink->priv; } EXPORT_SYMBOL_GPL(devlink_priv); struct devlink *priv_to_devlink(void *priv) { return container_of(priv, struct devlink, priv); } EXPORT_SYMBOL_GPL(priv_to_devlink); struct device *devlink_to_dev(const struct devlink *devlink) { return devlink->dev; } EXPORT_SYMBOL_GPL(devlink_to_dev); struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } EXPORT_SYMBOL_GPL(devlink_net); void devl_assert_locked(struct devlink *devlink) { lockdep_assert_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_assert_locked); #ifdef CONFIG_LOCKDEP /* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ bool devl_lock_is_held(struct devlink *devlink) { return lockdep_is_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock_is_held); #endif void devl_lock(struct devlink *devlink) { mutex_lock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock); int devl_trylock(struct devlink *devlink) { return mutex_trylock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_trylock); void devl_unlock(struct devlink *devlink) { mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_unlock); /** * devlink_try_get() - try to obtain a reference on a devlink instance * @devlink: instance to reference * * Obtain a reference on a devlink instance. A reference on a devlink instance * only implies that it's safe to take the instance lock. It does not imply * that the instance is registered, use devl_is_registered() after taking * the instance lock to check registration status. */ struct devlink *__must_check devlink_try_get(struct devlink *devlink) { if (refcount_inc_not_zero(&devlink->refcount)) return devlink; return NULL; } static void devlink_release(struct work_struct *work) { struct devlink *devlink; devlink = container_of(to_rcu_work(work), struct devlink, rwork); mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); kvfree(devlink); } void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) queue_rcu_work(system_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) { struct devlink *devlink = NULL; rcu_read_lock(); retry: devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); if (!devlink) goto unlock; if (!devlink_try_get(devlink)) goto next; if (!net_eq(devlink_net(devlink), net)) { devlink_put(devlink); goto next; } unlock: rcu_read_unlock(); return devlink; next: (*indexp)++; goto retry; } /** * devl_register - Register devlink instance * @devlink: devlink */ int devl_register(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); devl_assert_locked(devlink); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify_register(devlink); devlink_rel_nested_in_notify(devlink); return 0; } EXPORT_SYMBOL_GPL(devl_register); void devlink_register(struct devlink *devlink) { devl_lock(devlink); devl_register(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_register); /** * devl_unregister - Unregister devlink instance * @devlink: devlink */ void devl_unregister(struct devlink *devlink) { ASSERT_DEVLINK_REGISTERED(devlink); devl_assert_locked(devlink); devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_rel_put(devlink); } EXPORT_SYMBOL_GPL(devl_unregister); void devlink_unregister(struct devlink *devlink) { devl_lock(devlink); devl_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_unregister); /** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace * * @ops: ops * @priv_size: size of user private data * @net: net namespace * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. */ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, size_t priv_size, struct net *net, struct device *dev) { struct devlink *devlink; static u32 last_id; int ret; WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL); if (!devlink) return NULL; ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, &last_id, GFP_KERNEL); if (ret < 0) goto err_xa_alloc; devlink->dev = get_device(dev); devlink->ops = ops; xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); xa_init_flags(&devlink->params, XA_FLAGS_ALLOC); xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC); write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->region_list); INIT_LIST_HEAD(&devlink->reporter_list); INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); INIT_LIST_HEAD(&devlink->trap_policer_list); INIT_RCU_WORK(&devlink->rwork, devlink_release); lockdep_register_key(&devlink->lock_key); mutex_init(&devlink->lock); lockdep_set_class(&devlink->lock, &devlink->lock_key); refcount_set(&devlink->refcount, 1); return devlink; err_xa_alloc: kvfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_free - Free devlink instance resources * * @devlink: devlink */ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); WARN_ON(!list_empty(&devlink->region_list)); WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); xa_destroy(&devlink->nested_rels); xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->params); xa_destroy(&devlink->ports); xa_erase(&devlinks, devlink->index); devlink_put(devlink); } EXPORT_SYMBOL_GPL(devlink_free); static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ devlinks_xa_for_each_registered_get(net, index, devlink) { devl_dev_lock(devlink, true); err = 0; if (devl_is_registered(devlink)) err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); devl_dev_unlock(devlink, true); devlink_put(devlink); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } } static struct pernet_operations devlink_pernet_ops __net_initdata = { .pre_exit = devlink_pernet_pre_exit, }; static struct notifier_block devlink_port_netdevice_nb = { .notifier_call = devlink_port_netdevice_event, }; static int __init devlink_init(void) { int err; err = register_pernet_subsys(&devlink_pernet_ops); if (err) goto out; err = genl_register_family(&devlink_nl_family); if (err) goto out_unreg_pernet_subsys; err = register_netdevice_notifier(&devlink_port_netdevice_nb); if (!err) return 0; genl_unregister_family(&devlink_nl_family); out_unreg_pernet_subsys: unregister_pernet_subsys(&devlink_pernet_ops); out: WARN_ON(err); return err; } subsys_initcall(devlink_init);
150 108 2 89 84 4 12 6 70 1 2 67 1 2 1 3 4 34 1 62 22 55 4 15 31 9 23 21 6 15 9 22 3 18 23 1 9 13 13 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 /* * linux/fs/hfs/dir.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains directory-related functions independent of which * scheme is being used to represent forks. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include "hfs_fs.h" #include "btree.h" /* * hfs_lookup() */ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { hfs_cat_rec rec; struct hfs_find_data fd; struct inode *inode = NULL; int res; res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); if (res) return ERR_PTR(res); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { if (res != -ENOENT) inode = ERR_PTR(res); } else { inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); if (!inode) inode = ERR_PTR(-EACCES); } hfs_find_exit(&fd); return d_splice_alias(inode, dentry); } /* * hfs_readdir */ static int hfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; union hfs_cat_rec entry; struct hfs_find_data fd; struct hfs_readdir_data *rd; u16 type; if (ctx->pos >= inode->i_size) return 0; err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (err) return err; hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) goto out; if (ctx->pos == 0) { /* This is completely artificial... */ if (!dir_emit_dot(file, ctx)) goto out; ctx->pos = 1; } if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } //if (fd.entrylength < HFS_MIN_THREAD_SZ) { // pr_err("truncated catalog thread\n"); // err = -EIO; // goto out; //} if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.ParID), DT_DIR)) goto out; ctx->pos = 2; } if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, ctx->pos - 1); if (err) goto out; for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { pr_err("walked past end of dir\n"); err = -EIO; goto out; } if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = entry.type; len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); if (type == HFS_CDR_DIR) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) { pr_err("small dir entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.dir.DirID), DT_DIR)) break; } else if (type == HFS_CDR_FIL) { if (fd.entrylength < sizeof(struct hfs_cat_file)) { pr_err("small file entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { pr_err("bad catalog entry type %d\n", type); err = -EIO; goto out; } ctx->pos++; if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } file->private_data = rd; rd->file = file; spin_lock(&HFS_I(inode)->open_dir_lock); list_add(&rd->list, &HFS_I(inode)->open_dir_list); spin_unlock(&HFS_I(inode)->open_dir_lock); } /* * Can be done after the list insertion; exclusion with * hfs_delete_cat() is provided by directory lock. */ memcpy(&rd->key, &fd.key->cat, sizeof(struct hfs_cat_key)); out: hfs_find_exit(&fd); return err; } static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { spin_lock(&HFS_I(inode)->open_dir_lock); list_del(&rd->list); spin_unlock(&HFS_I(inode)->open_dir_lock); kfree(rd); } return 0; } /* * hfs_create() * * This is the create() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new file in * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, mode); if (!inode) return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; } d_instantiate(dentry, inode); mark_inode_dirty(inode); return 0; } /* * hfs_mkdir() * * This is the mkdir() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new directory * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ static struct dentry *hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) return ERR_PTR(-ENOMEM); res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return ERR_PTR(res); } d_instantiate(dentry, inode); mark_inode_dirty(inode); return NULL; } /* * hfs_remove() * * This serves as both unlink() and rmdir() in the inode_operations * structure for regular HFS directories. The purpose is to delete * an existing child, given the inode for the parent directory and * the name (and its length) of the existing directory. * * HFS does not have hardlinks, so both rmdir and unlink set the * link count to 0. The only difference is the emptiness check. */ static int hfs_remove(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int res; if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); if (res) return res; clear_nlink(inode); inode_set_ctime_current(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; } /* * hfs_rename() * * This is the rename() entry in the inode_operations structure for * regular HFS directories. The purpose is to rename an existing * file or directory, given the inode for the current directory and * the name (and its length) of the existing file/directory and the * inode for the new directory and the name (and its length) of the * new file/directory. * XXX: how do you handle must_be dir? */ static int hfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int res; if (flags & ~RENAME_NOREPLACE) return -EINVAL; /* Unlink destination if it already exists */ if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); if (res) return res; } res = hfs_cat_move(d_inode(old_dentry)->i_ino, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) hfs_cat_build_key(old_dir->i_sb, (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key, new_dir->i_ino, &new_dentry->d_name); return res; } const struct file_operations hfs_dir_operations = { .read = generic_read_dir, .iterate_shared = hfs_readdir, .llseek = generic_file_llseek, .release = hfs_dir_release, }; const struct inode_operations hfs_dir_inode_operations = { .create = hfs_create, .lookup = hfs_lookup, .unlink = hfs_remove, .mkdir = hfs_mkdir, .rmdir = hfs_remove, .rename = hfs_rename, .setattr = hfs_inode_setattr, };
165 166 152 270 186 261 34 7 28 34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* * linux/fs/hfs/string.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains the string comparison function for the * Macintosh character set. * * The code in this file is derived from code which is copyright * 1986, 1989, 1990 by Abacus Research and Development, Inc. (ARDI) * It is used here by the permission of ARDI's president Cliff Matthews. */ #include "hfs_fs.h" #include <linux/dcache.h> /*================ File-local variables ================*/ /* * unsigned char caseorder[] * * Defines the lexical ordering of characters on the Macintosh * * Composition of the 'casefold' and 'order' tables from ARDI's code * with the entry for 0x20 changed to match that for 0xCA to remove * special case for those two characters. */ static unsigned char caseorder[256] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 0x20,0x22,0x23,0x28,0x29,0x2A,0x2B,0x2C,0x2F,0x30,0x31,0x32,0x33,0x34,0x35,0x36, 0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43,0x44,0x45,0x46, 0x47,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xA9,0xAA,0xAB,0xAC,0xAD, 0x4E,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xAF,0xB0,0xB1,0xB2,0xB3, 0x4A,0x4C,0x5A,0x60,0x7B,0x7F,0x98,0x4F,0x49,0x51,0x4A,0x4B,0x4C,0x5A,0x60,0x63, 0x64,0x65,0x6E,0x6F,0x70,0x71,0x7B,0x84,0x85,0x86,0x7F,0x80,0x9A,0x9B,0x9C,0x98, 0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0x94,0xBB,0xBC,0xBD,0xBE,0xBF,0xC0,0x4D,0x81, 0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0x55,0x8A,0xCC,0x4D,0x81, 0xCD,0xCE,0xCF,0xD0,0xD1,0xD2,0xD3,0x26,0x27,0xD4,0x20,0x49,0x4B,0x80,0x82,0x82, 0xD5,0xD6,0x24,0x25,0x2D,0x2E,0xD7,0xD8,0xA6,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF }; /*================ Global functions ================*/ /* * Hash a string to an integer in a case-independent way */ int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) { const unsigned char *name = this->name; unsigned int hash, len = this->len; if (len > HFS_NAMELEN) len = HFS_NAMELEN; hash = init_name_hash(dentry); for (; len; len--) hash = partial_name_hash(caseorder[*name++], hash); this->hash = end_name_hash(hash); return 0; } /* * Compare two strings in the HFS filename character ordering * Returns positive, negative, or zero, not just 0 or (+/-)1 * * Equivalent to ARDI's call: * ROMlib_RelString(s1+1, s2+1, true, false, (s1[0]<<16) | s2[0]) */ int hfs_strcmp(const unsigned char *s1, unsigned int len1, const unsigned char *s2, unsigned int len2) { int len, tmp; len = (len1 > len2) ? len2 : len1; while (len--) { tmp = (int)caseorder[*(s1++)] - (int)caseorder[*(s2++)]; if (tmp) return tmp; } return len1 - len2; } /* * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; if (len >= HFS_NAMELEN) { if (name->len < HFS_NAMELEN) return 1; len = HFS_NAMELEN; } else if (len != name->len) return 1; n1 = str; n2 = name->name; while (len--) { if (caseorder[*n1++] != caseorder[*n2++]) return 1; } return 0; }
52 52 35 35 11 36 36 3 23 60 14 72 53 18 18 35 35 1 35 35 3 2 1 1 66 54 14 7 8 14 5 17 5 4 2 8 7 1 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * synth device handlers * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include "seq_oss_synth.h" #include "seq_oss_midi.h" #include "../seq_lock.h" #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> /* * constants */ #define SNDRV_SEQ_OSS_MAX_SYNTH_NAME 30 #define MAX_SYSEX_BUFLEN 128 /* * definition of synth info records */ /* synth info */ struct seq_oss_synth { int seq_device; /* for synth_info */ int synth_type; int synth_subtype; int nr_voices; char name[SNDRV_SEQ_OSS_MAX_SYNTH_NAME]; struct snd_seq_oss_callback oper; int opened; void *private_data; snd_use_lock_t use_lock; }; /* * device table */ static int max_synth_devs; static struct seq_oss_synth *synth_devs[SNDRV_SEQ_OSS_MAX_SYNTH_DEVS]; static struct seq_oss_synth midi_synth_dev = { .seq_device = -1, .synth_type = SYNTH_TYPE_MIDI, .synth_subtype = 0, .nr_voices = 16, .name = "MIDI", }; static DEFINE_SPINLOCK(register_lock); /* * prototypes */ static struct seq_oss_synth *get_synthdev(struct seq_oss_devinfo *dp, int dev); static void reset_channels(struct seq_oss_synthinfo *info); /* * global initialization */ void __init snd_seq_oss_synth_init(void) { snd_use_lock_init(&midi_synth_dev.use_lock); } /* * registration of the synth device */ int snd_seq_oss_synth_probe(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); int i; struct seq_oss_synth *rec; struct snd_seq_oss_reg *reg = SNDRV_SEQ_DEVICE_ARGPTR(dev); unsigned long flags; rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (!rec) return -ENOMEM; rec->seq_device = -1; rec->synth_type = reg->type; rec->synth_subtype = reg->subtype; rec->nr_voices = reg->nvoices; rec->oper = reg->oper; rec->private_data = reg->private_data; rec->opened = 0; snd_use_lock_init(&rec->use_lock); /* copy and truncate the name of synth device */ strscpy(rec->name, dev->name, sizeof(rec->name)); /* registration */ spin_lock_irqsave(&register_lock, flags); for (i = 0; i < max_synth_devs; i++) { if (synth_devs[i] == NULL) break; } if (i >= max_synth_devs) { if (max_synth_devs >= SNDRV_SEQ_OSS_MAX_SYNTH_DEVS) { spin_unlock_irqrestore(&register_lock, flags); pr_err("ALSA: seq_oss: no more synth slot\n"); kfree(rec); return -ENOMEM; } max_synth_devs++; } rec->seq_device = i; synth_devs[i] = rec; spin_unlock_irqrestore(&register_lock, flags); dev->driver_data = rec; #ifdef SNDRV_OSS_INFO_DEV_SYNTH if (i < SNDRV_CARDS) snd_oss_info_register(SNDRV_OSS_INFO_DEV_SYNTH, i, rec->name); #endif return 0; } int snd_seq_oss_synth_remove(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); int index; struct seq_oss_synth *rec = dev->driver_data; unsigned long flags; spin_lock_irqsave(&register_lock, flags); for (index = 0; index < max_synth_devs; index++) { if (synth_devs[index] == rec) break; } if (index >= max_synth_devs) { spin_unlock_irqrestore(&register_lock, flags); pr_err("ALSA: seq_oss: can't unregister synth\n"); return -EINVAL; } synth_devs[index] = NULL; if (index == max_synth_devs - 1) { for (index--; index >= 0; index--) { if (synth_devs[index]) break; } max_synth_devs = index + 1; } spin_unlock_irqrestore(&register_lock, flags); #ifdef SNDRV_OSS_INFO_DEV_SYNTH if (rec->seq_device < SNDRV_CARDS) snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_SYNTH, rec->seq_device); #endif snd_use_lock_sync(&rec->use_lock); kfree(rec); return 0; } /* */ static struct seq_oss_synth * get_sdev(int dev) { struct seq_oss_synth *rec; unsigned long flags; spin_lock_irqsave(&register_lock, flags); rec = synth_devs[dev]; if (rec) snd_use_lock_use(&rec->use_lock); spin_unlock_irqrestore(&register_lock, flags); return rec; } /* * set up synth tables */ void snd_seq_oss_synth_setup(struct seq_oss_devinfo *dp) { int i; struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; dp->max_synthdev = max_synth_devs; dp->synth_opened = 0; memset(dp->synths, 0, sizeof(dp->synths)); for (i = 0; i < dp->max_synthdev; i++) { rec = get_sdev(i); if (rec == NULL) continue; if (rec->oper.open == NULL || rec->oper.close == NULL) { snd_use_lock_free(&rec->use_lock); continue; } info = &dp->synths[i]; info->arg.app_index = dp->port; info->arg.file_mode = dp->file_mode; info->arg.seq_mode = dp->seq_mode; if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_SYNTH) info->arg.event_passing = SNDRV_SEQ_OSS_PROCESS_EVENTS; else info->arg.event_passing = SNDRV_SEQ_OSS_PASS_EVENTS; info->opened = 0; if (!try_module_get(rec->oper.owner)) { snd_use_lock_free(&rec->use_lock); continue; } if (rec->oper.open(&info->arg, rec->private_data) < 0) { module_put(rec->oper.owner); snd_use_lock_free(&rec->use_lock); continue; } info->nr_voices = rec->nr_voices; if (info->nr_voices > 0) { info->ch = kcalloc(info->nr_voices, sizeof(struct seq_oss_chinfo), GFP_KERNEL); if (!info->ch) { rec->oper.close(&info->arg); module_put(rec->oper.owner); snd_use_lock_free(&rec->use_lock); continue; } reset_channels(info); } info->opened++; rec->opened++; dp->synth_opened++; snd_use_lock_free(&rec->use_lock); } } /* * set up synth tables for MIDI emulation - /dev/music mode only */ void snd_seq_oss_synth_setup_midi(struct seq_oss_devinfo *dp) { int i; if (dp->max_synthdev >= SNDRV_SEQ_OSS_MAX_SYNTH_DEVS) return; for (i = 0; i < dp->max_mididev; i++) { struct seq_oss_synthinfo *info; info = &dp->synths[dp->max_synthdev]; if (snd_seq_oss_midi_open(dp, i, dp->file_mode) < 0) continue; info->arg.app_index = dp->port; info->arg.file_mode = dp->file_mode; info->arg.seq_mode = dp->seq_mode; info->arg.private_data = info; info->is_midi = 1; info->midi_mapped = i; info->arg.event_passing = SNDRV_SEQ_OSS_PASS_EVENTS; snd_seq_oss_midi_get_addr(dp, i, &info->arg.addr); info->opened = 1; midi_synth_dev.opened++; dp->max_synthdev++; if (dp->max_synthdev >= SNDRV_SEQ_OSS_MAX_SYNTH_DEVS) break; } } /* * clean up synth tables */ void snd_seq_oss_synth_cleanup(struct seq_oss_devinfo *dp) { int i; struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; if (snd_BUG_ON(dp->max_synthdev > SNDRV_SEQ_OSS_MAX_SYNTH_DEVS)) return; for (i = 0; i < dp->max_synthdev; i++) { info = &dp->synths[i]; if (! info->opened) continue; if (info->is_midi) { if (midi_synth_dev.opened > 0) { snd_seq_oss_midi_close(dp, info->midi_mapped); midi_synth_dev.opened--; } } else { rec = get_sdev(i); if (rec == NULL) continue; if (rec->opened > 0) { rec->oper.close(&info->arg); module_put(rec->oper.owner); rec->opened = 0; } snd_use_lock_free(&rec->use_lock); } kfree(info->ch); info->ch = NULL; } dp->synth_opened = 0; dp->max_synthdev = 0; } static struct seq_oss_synthinfo * get_synthinfo_nospec(struct seq_oss_devinfo *dp, int dev) { if (dev < 0 || dev >= dp->max_synthdev) return NULL; dev = array_index_nospec(dev, SNDRV_SEQ_OSS_MAX_SYNTH_DEVS); return &dp->synths[dev]; } /* * return synth device information pointer */ static struct seq_oss_synth * get_synthdev(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info = get_synthinfo_nospec(dp, dev); if (!info) return NULL; if (!info->opened) return NULL; if (info->is_midi) { rec = &midi_synth_dev; snd_use_lock_use(&rec->use_lock); } else { rec = get_sdev(dev); if (!rec) return NULL; } if (! rec->opened) { snd_use_lock_free(&rec->use_lock); return NULL; } return rec; } /* * reset note and velocity on each channel. */ static void reset_channels(struct seq_oss_synthinfo *info) { int i; if (info->ch == NULL || ! info->nr_voices) return; for (i = 0; i < info->nr_voices; i++) { info->ch[i].note = -1; info->ch[i].vel = 0; } } /* * reset synth device: * call reset callback. if no callback is defined, send a heartbeat * event to the corresponding port. */ void snd_seq_oss_synth_reset(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; info = get_synthinfo_nospec(dp, dev); if (!info || !info->opened) return; reset_channels(info); if (info->is_midi) { if (midi_synth_dev.opened <= 0) return; snd_seq_oss_midi_reset(dp, info->midi_mapped); /* reopen the device */ snd_seq_oss_midi_close(dp, dev); if (snd_seq_oss_midi_open(dp, info->midi_mapped, dp->file_mode) < 0) { midi_synth_dev.opened--; info->opened = 0; kfree(info->ch); info->ch = NULL; } return; } rec = get_sdev(dev); if (rec == NULL) return; if (rec->oper.reset) { rec->oper.reset(&info->arg); } else { struct snd_seq_event ev; memset(&ev, 0, sizeof(ev)); snd_seq_oss_fill_addr(dp, &ev, info->arg.addr.client, info->arg.addr.port); ev.type = SNDRV_SEQ_EVENT_RESET; snd_seq_oss_dispatch(dp, &ev, 0, 0); } snd_use_lock_free(&rec->use_lock); } /* * load a patch record: * call load_patch callback function */ int snd_seq_oss_synth_load_patch(struct seq_oss_devinfo *dp, int dev, int fmt, const char __user *buf, int p, int c) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; int rc; info = get_synthinfo_nospec(dp, dev); if (!info) return -ENXIO; if (info->is_midi) return 0; rec = get_synthdev(dp, dev); if (!rec) return -ENXIO; if (rec->oper.load_patch == NULL) rc = -ENXIO; else rc = rec->oper.load_patch(&info->arg, fmt, buf, p, c); snd_use_lock_free(&rec->use_lock); return rc; } /* * check if the device is valid synth device and return the synth info */ struct seq_oss_synthinfo * snd_seq_oss_synth_info(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_synth *rec; rec = get_synthdev(dp, dev); if (rec) { snd_use_lock_free(&rec->use_lock); return get_synthinfo_nospec(dp, dev); } return NULL; } /* * receive OSS 6 byte sysex packet: * the event is filled and prepared for sending immediately * (i.e. sysex messages are fragmented) */ int snd_seq_oss_synth_sysex(struct seq_oss_devinfo *dp, int dev, unsigned char *buf, struct snd_seq_event *ev) { unsigned char *p; int len = 6; p = memchr(buf, 0xff, 6); if (p) len = p - buf + 1; /* copy the data to event record and send it */ if (snd_seq_oss_synth_addr(dp, dev, ev)) return -EINVAL; ev->flags = SNDRV_SEQ_EVENT_LENGTH_VARIABLE; ev->data.ext.len = len; ev->data.ext.ptr = buf; return 0; } /* * fill the event source/destination addresses */ int snd_seq_oss_synth_addr(struct seq_oss_devinfo *dp, int dev, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info = snd_seq_oss_synth_info(dp, dev); if (!info) return -EINVAL; snd_seq_oss_fill_addr(dp, ev, info->arg.addr.client, info->arg.addr.port); return 0; } /* * OSS compatible ioctl */ int snd_seq_oss_synth_ioctl(struct seq_oss_devinfo *dp, int dev, unsigned int cmd, unsigned long addr) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; int rc; info = get_synthinfo_nospec(dp, dev); if (!info || info->is_midi) return -ENXIO; rec = get_synthdev(dp, dev); if (!rec) return -ENXIO; if (rec->oper.ioctl == NULL) rc = -ENXIO; else rc = rec->oper.ioctl(&info->arg, cmd, addr); snd_use_lock_free(&rec->use_lock); return rc; } /* * send OSS raw events - SEQ_PRIVATE and SEQ_VOLUME */ int snd_seq_oss_synth_raw_event(struct seq_oss_devinfo *dp, int dev, unsigned char *data, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info; info = snd_seq_oss_synth_info(dp, dev); if (!info || info->is_midi) return -ENXIO; ev->type = SNDRV_SEQ_EVENT_OSS; memcpy(ev->data.raw8.d, data, 8); return snd_seq_oss_synth_addr(dp, dev, ev); } /* * create OSS compatible synth_info record */ int snd_seq_oss_synth_make_info(struct seq_oss_devinfo *dp, int dev, struct synth_info *inf) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info = get_synthinfo_nospec(dp, dev); if (!info) return -ENXIO; if (info->is_midi) { struct midi_info minf; if (snd_seq_oss_midi_make_info(dp, info->midi_mapped, &minf)) return -ENXIO; inf->synth_type = SYNTH_TYPE_MIDI; inf->synth_subtype = 0; inf->nr_voices = 16; inf->device = dev; strscpy(inf->name, minf.name, sizeof(inf->name)); } else { rec = get_synthdev(dp, dev); if (!rec) return -ENXIO; inf->synth_type = rec->synth_type; inf->synth_subtype = rec->synth_subtype; inf->nr_voices = rec->nr_voices; inf->device = dev; strscpy(inf->name, rec->name, sizeof(inf->name)); snd_use_lock_free(&rec->use_lock); } return 0; } #ifdef CONFIG_SND_PROC_FS /* * proc interface */ void snd_seq_oss_synth_info_read(struct snd_info_buffer *buf) { int i; struct seq_oss_synth *rec; snd_iprintf(buf, "\nNumber of synth devices: %d\n", max_synth_devs); for (i = 0; i < max_synth_devs; i++) { snd_iprintf(buf, "\nsynth %d: ", i); rec = get_sdev(i); if (rec == NULL) { snd_iprintf(buf, "*empty*\n"); continue; } snd_iprintf(buf, "[%s]\n", rec->name); snd_iprintf(buf, " type 0x%x : subtype 0x%x : voices %d\n", rec->synth_type, rec->synth_subtype, rec->nr_voices); snd_iprintf(buf, " capabilities : ioctl %s / load_patch %s\n", str_enabled_disabled((long)rec->oper.ioctl), str_enabled_disabled((long)rec->oper.load_patch)); snd_use_lock_free(&rec->use_lock); } } #endif /* CONFIG_SND_PROC_FS */
263 19 240 39 5 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 // SPDX-License-Identifier: GPL-2.0-or-later /* * Media Controller ancillary functions * * Copyright (c) 2016 Mauro Carvalho Chehab <mchehab@kernel.org> * Copyright (C) 2016 Shuah Khan <shuahkh@osg.samsung.com> * Copyright (C) 2006-2010 Nokia Corporation * Copyright (c) 2016 Intel Corporation. */ #include <linux/module.h> #include <linux/pci.h> #include <linux/usb.h> #include <media/media-device.h> #include <media/media-entity.h> #include <media/v4l2-fh.h> #include <media/v4l2-mc.h> #include <media/v4l2-subdev.h> #include <media/videobuf2-core.h> int v4l2_mc_create_media_graph(struct media_device *mdev) { struct media_entity *entity; struct media_entity *if_vid = NULL, *if_aud = NULL; struct media_entity *tuner = NULL, *decoder = NULL; struct media_entity *io_v4l = NULL, *io_vbi = NULL, *io_swradio = NULL; bool is_webcam = false; u32 flags; int ret, pad_sink, pad_source; if (!mdev) return 0; media_device_for_each_entity(entity, mdev) { switch (entity->function) { case MEDIA_ENT_F_IF_VID_DECODER: if_vid = entity; break; case MEDIA_ENT_F_IF_AUD_DECODER: if_aud = entity; break; case MEDIA_ENT_F_TUNER: tuner = entity; break; case MEDIA_ENT_F_ATV_DECODER: decoder = entity; break; case MEDIA_ENT_F_IO_V4L: io_v4l = entity; break; case MEDIA_ENT_F_IO_VBI: io_vbi = entity; break; case MEDIA_ENT_F_IO_SWRADIO: io_swradio = entity; break; case MEDIA_ENT_F_CAM_SENSOR: is_webcam = true; break; } } /* It should have at least one I/O entity */ if (!io_v4l && !io_vbi && !io_swradio) { dev_warn(mdev->dev, "Didn't find any I/O entity\n"); return -EINVAL; } /* * Here, webcams are modelled on a very simple way: the sensor is * connected directly to the I/O entity. All dirty details, like * scaler and crop HW are hidden. While such mapping is not enough * for mc-centric hardware, it is enough for v4l2 interface centric * PC-consumer's hardware. */ if (is_webcam) { if (!io_v4l) { dev_warn(mdev->dev, "Didn't find a MEDIA_ENT_F_IO_V4L\n"); return -EINVAL; } media_device_for_each_entity(entity, mdev) { if (entity->function != MEDIA_ENT_F_CAM_SENSOR) continue; ret = media_create_pad_link(entity, 0, io_v4l, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "Failed to create a sensor link\n"); return ret; } } if (!decoder) return 0; } /* The device isn't a webcam. So, it should have a decoder */ if (!decoder) { dev_warn(mdev->dev, "Decoder not found\n"); return -EINVAL; } /* Link the tuner and IF video output pads */ if (tuner) { if (if_vid) { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(if_vid, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "Couldn't get tuner and/or PLL pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, if_vid, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "Couldn't create tuner->PLL link)\n"); return ret; } pad_source = media_get_pad_index(if_vid, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "get decoder and/or PLL pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(if_vid, pad_source, decoder, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link PLL to decoder\n"); return ret; } } else { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner and/or decoder pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, decoder, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) return ret; } if (if_aud) { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_AUDIO); pad_sink = media_get_pad_index(if_aud, MEDIA_PAD_FL_SINK, PAD_SIGNAL_AUDIO); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner and/or decoder pad(s) for audio: (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, if_aud, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link tuner->audio PLL\n"); return ret; } } else { if_aud = tuner; } } /* Create demod to V4L, VBI and SDR radio links */ if (io_v4l) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for V4L I/O\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_v4l, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to V4L I/O\n"); return ret; } } if (io_swradio) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for SDR\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_swradio, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to SDR\n"); return ret; } } if (io_vbi) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for VBI\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_vbi, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to VBI\n"); return ret; } } /* Create links for the media connectors */ flags = MEDIA_LNK_FL_ENABLED; media_device_for_each_entity(entity, mdev) { switch (entity->function) { case MEDIA_ENT_F_CONN_RF: if (!tuner) continue; pad_sink = media_get_pad_index(tuner, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner analog pad sink\n"); return -EINVAL; } ret = media_create_pad_link(entity, 0, tuner, pad_sink, flags); break; case MEDIA_ENT_F_CONN_SVIDEO: case MEDIA_ENT_F_CONN_COMPOSITE: pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_sink < 0) { dev_warn(mdev->dev, "couldn't get decoder analog pad sink\n"); return -EINVAL; } ret = media_create_pad_link(entity, 0, decoder, pad_sink, flags); break; default: continue; } if (ret) return ret; flags = 0; } return 0; } EXPORT_SYMBOL_GPL(v4l2_mc_create_media_graph); int v4l_enable_media_source(struct video_device *vdev) { struct media_device *mdev = vdev->entity.graph_obj.mdev; int ret = 0, err; if (!mdev) return 0; mutex_lock(&mdev->graph_mutex); if (!mdev->enable_source) goto end; err = mdev->enable_source(&vdev->entity, &vdev->pipe); if (err) ret = -EBUSY; end: mutex_unlock(&mdev->graph_mutex); return ret; } EXPORT_SYMBOL_GPL(v4l_enable_media_source); void v4l_disable_media_source(struct video_device *vdev) { struct media_device *mdev = vdev->entity.graph_obj.mdev; if (mdev) { mutex_lock(&mdev->graph_mutex); if (mdev->disable_source) mdev->disable_source(&vdev->entity); mutex_unlock(&mdev->graph_mutex); } } EXPORT_SYMBOL_GPL(v4l_disable_media_source); int v4l_vb2q_enable_media_source(struct vb2_queue *q) { struct v4l2_fh *fh = q->owner; if (fh && fh->vdev) return v4l_enable_media_source(fh->vdev); return 0; } EXPORT_SYMBOL_GPL(v4l_vb2q_enable_media_source); int v4l2_create_fwnode_links_to_pad(struct v4l2_subdev *src_sd, struct media_pad *sink, u32 flags) { struct fwnode_handle *endpoint; if (!(sink->flags & MEDIA_PAD_FL_SINK)) return -EINVAL; fwnode_graph_for_each_endpoint(src_sd->fwnode, endpoint) { struct fwnode_handle *remote_ep; int src_idx, sink_idx, ret; struct media_pad *src; src_idx = media_entity_get_fwnode_pad(&src_sd->entity, endpoint, MEDIA_PAD_FL_SOURCE); if (src_idx < 0) { dev_dbg(src_sd->dev, "no source pad found for %pfw\n", endpoint); continue; } remote_ep = fwnode_graph_get_remote_endpoint(endpoint); if (!remote_ep) { dev_dbg(src_sd->dev, "no remote ep found for %pfw\n", endpoint); continue; } /* * ask the sink to verify it owns the remote endpoint, * and translate to a sink pad. */ sink_idx = media_entity_get_fwnode_pad(sink->entity, remote_ep, MEDIA_PAD_FL_SINK); fwnode_handle_put(remote_ep); if (sink_idx < 0 || sink_idx != sink->index) { dev_dbg(src_sd->dev, "sink pad index mismatch or error (is %d, expected %u)\n", sink_idx, sink->index); continue; } /* * the source endpoint corresponds to one of its source pads, * the source endpoint connects to an endpoint at the sink * entity, and the sink endpoint corresponds to the sink * pad requested, so we have found an endpoint connection * that works, create the media link for it. */ src = &src_sd->entity.pads[src_idx]; /* skip if link already exists */ if (media_entity_find_link(src, sink)) { dev_dbg(src_sd->dev, "link %s:%d -> %s:%d already exists\n", src_sd->entity.name, src_idx, sink->entity->name, sink_idx); continue; } dev_dbg(src_sd->dev, "creating link %s:%d -> %s:%d\n", src_sd->entity.name, src_idx, sink->entity->name, sink_idx); ret = media_create_pad_link(&src_sd->entity, src_idx, sink->entity, sink_idx, flags); if (ret) { dev_err(src_sd->dev, "link %s:%d -> %s:%d failed with %d\n", src_sd->entity.name, src_idx, sink->entity->name, sink_idx, ret); fwnode_handle_put(endpoint); return ret; } } return 0; } EXPORT_SYMBOL_GPL(v4l2_create_fwnode_links_to_pad); int v4l2_create_fwnode_links(struct v4l2_subdev *src_sd, struct v4l2_subdev *sink_sd) { unsigned int i; for (i = 0; i < sink_sd->entity.num_pads; i++) { struct media_pad *pad = &sink_sd->entity.pads[i]; int ret; if (!(pad->flags & MEDIA_PAD_FL_SINK)) continue; ret = v4l2_create_fwnode_links_to_pad(src_sd, pad, 0); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(v4l2_create_fwnode_links); /* ----------------------------------------------------------------------------- * Pipeline power management * * Entities must be powered up when part of a pipeline that contains at least * one open video device node. * * To achieve this use the entity use_count field to track the number of users. * For entities corresponding to video device nodes the use_count field stores * the users count of the node. For entities corresponding to subdevs the * use_count field stores the total number of users of all video device nodes * in the pipeline. * * The v4l2_pipeline_pm_{get, put}() functions must be called in the open() and * close() handlers of video device nodes. It increments or decrements the use * count of all subdev entities in the pipeline. * * To react to link management on powered pipelines, the link setup notification * callback updates the use count of all entities in the source and sink sides * of the link. */ /* * pipeline_pm_use_count - Count the number of users of a pipeline * @entity: The entity * * Return the total number of users of all video device nodes in the pipeline. */ static int pipeline_pm_use_count(struct media_entity *entity, struct media_graph *graph) { int use = 0; media_graph_walk_start(graph, entity); while ((entity = media_graph_walk_next(graph))) { if (is_media_entity_v4l2_video_device(entity)) use += entity->use_count; } return use; } /* * pipeline_pm_power_one - Apply power change to an entity * @entity: The entity * @change: Use count change * * Change the entity use count by @change. If the entity is a subdev update its * power state by calling the core::s_power operation when the use count goes * from 0 to != 0 or from != 0 to 0. * * Return 0 on success or a negative error code on failure. */ static int pipeline_pm_power_one(struct media_entity *entity, int change) { struct v4l2_subdev *subdev; int ret; subdev = is_media_entity_v4l2_subdev(entity) ? media_entity_to_v4l2_subdev(entity) : NULL; if (entity->use_count == 0 && change > 0 && subdev != NULL) { ret = v4l2_subdev_call(subdev, core, s_power, 1); if (ret < 0 && ret != -ENOIOCTLCMD) return ret; } entity->use_count += change; WARN_ON(entity->use_count < 0); if (entity->use_count == 0 && change < 0 && subdev != NULL) v4l2_subdev_call(subdev, core, s_power, 0); return 0; } /* * pipeline_pm_power - Apply power change to all entities in a pipeline * @entity: The entity * @change: Use count change * * Walk the pipeline to update the use count and the power state of all non-node * entities. * * Return 0 on success or a negative error code on failure. */ static int pipeline_pm_power(struct media_entity *entity, int change, struct media_graph *graph) { struct media_entity *first = entity; int ret = 0; if (!change) return 0; media_graph_walk_start(graph, entity); while (!ret && (entity = media_graph_walk_next(graph))) if (is_media_entity_v4l2_subdev(entity)) ret = pipeline_pm_power_one(entity, change); if (!ret) return ret; media_graph_walk_start(graph, first); while ((first = media_graph_walk_next(graph)) && first != entity) if (is_media_entity_v4l2_subdev(first)) pipeline_pm_power_one(first, -change); return ret; } static int v4l2_pipeline_pm_use(struct media_entity *entity, unsigned int use) { struct media_device *mdev = entity->graph_obj.mdev; int change = use ? 1 : -1; int ret; mutex_lock(&mdev->graph_mutex); /* Apply use count to node. */ entity->use_count += change; WARN_ON(entity->use_count < 0); /* Apply power change to connected non-nodes. */ ret = pipeline_pm_power(entity, change, &mdev->pm_count_walk); if (ret < 0) entity->use_count -= change; mutex_unlock(&mdev->graph_mutex); return ret; } int v4l2_pipeline_pm_get(struct media_entity *entity) { return v4l2_pipeline_pm_use(entity, 1); } EXPORT_SYMBOL_GPL(v4l2_pipeline_pm_get); void v4l2_pipeline_pm_put(struct media_entity *entity) { /* Powering off entities shouldn't fail. */ WARN_ON(v4l2_pipeline_pm_use(entity, 0)); } EXPORT_SYMBOL_GPL(v4l2_pipeline_pm_put); int v4l2_pipeline_link_notify(struct media_link *link, u32 flags, unsigned int notification) { struct media_graph *graph = &link->graph_obj.mdev->pm_count_walk; struct media_entity *source = link->source->entity; struct media_entity *sink = link->sink->entity; int source_use; int sink_use; int ret = 0; source_use = pipeline_pm_use_count(source, graph); sink_use = pipeline_pm_use_count(sink, graph); if (notification == MEDIA_DEV_NOTIFY_POST_LINK_CH && !(flags & MEDIA_LNK_FL_ENABLED)) { /* Powering off entities is assumed to never fail. */ pipeline_pm_power(source, -sink_use, graph); pipeline_pm_power(sink, -source_use, graph); return 0; } if (notification == MEDIA_DEV_NOTIFY_PRE_LINK_CH && (flags & MEDIA_LNK_FL_ENABLED)) { ret = pipeline_pm_power(source, sink_use, graph); if (ret < 0) return ret; ret = pipeline_pm_power(sink, source_use, graph); if (ret < 0) pipeline_pm_power(source, -sink_use, graph); } return ret; } EXPORT_SYMBOL_GPL(v4l2_pipeline_link_notify);
4 4 4 4 2 2 1 9 4 5 10 2 5 4 4 8 3 4 1 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 // SPDX-License-Identifier: GPL-2.0-only /* DVB USB compliant linux driver for MSI Mega Sky 580 DVB-T USB2.0 receiver * * Copyright (C) 2006 Aapo Tahkola (aet@rasterburn.org) * * see Documentation/driver-api/media/drivers/dvb-usb.rst for more information */ #include "m920x.h" #include "mt352.h" #include "mt352_priv.h" #include "qt1010.h" #include "tda1004x.h" #include "tda827x.h" #include "mt2060.h" #include <media/tuner.h> #include "tuner-simple.h" #include <linux/unaligned.h> /* debug */ static int dvb_usb_m920x_debug; module_param_named(debug,dvb_usb_m920x_debug, int, 0644); MODULE_PARM_DESC(debug, "set debugging level (1=rc (or-able))." DVB_USB_DEBUG_STATUS); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); static int m920x_set_filter(struct dvb_usb_device *d, int type, int idx, int pid); static inline int m920x_read(struct usb_device *udev, u8 request, u16 value, u16 index, void *data, int size) { int ret; ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), request, USB_TYPE_VENDOR | USB_DIR_IN, value, index, data, size, 2000); if (ret < 0) { printk(KERN_INFO "m920x_read = error: %d\n", ret); return ret; } if (ret != size) { deb("m920x_read = no data\n"); return -EIO; } return 0; } static inline int m920x_write(struct usb_device *udev, u8 request, u16 value, u16 index) { return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), request, USB_TYPE_VENDOR | USB_DIR_OUT, value, index, NULL, 0, 2000); } static inline int m920x_write_seq(struct usb_device *udev, u8 request, struct m920x_inits *seq) { int ret; do { ret = m920x_write(udev, request, seq->data, seq->address); if (ret != 0) return ret; seq++; } while (seq->address); return 0; } static int m920x_init(struct dvb_usb_device *d, struct m920x_inits *rc_seq) { int ret, i, epi, flags = 0; int adap_enabled[M9206_MAX_ADAPTERS] = { 0 }; /* Remote controller init. */ if (d->props.rc.legacy.rc_query || d->props.rc.core.rc_query) { deb("Initialising remote control\n"); ret = m920x_write_seq(d->udev, M9206_CORE, rc_seq); if (ret != 0) { deb("Initialising remote control failed\n"); return ret; } deb("Initialising remote control success\n"); } for (i = 0; i < d->props.num_adapters; i++) flags |= d->adapter[i].props.fe[0].caps; /* Some devices(Dposh) might crash if we attempt touch at all. */ if (flags & DVB_USB_ADAP_HAS_PID_FILTER) { for (i = 0; i < d->props.num_adapters; i++) { epi = d->adapter[i].props.fe[0].stream.endpoint - 0x81; if (epi < 0 || epi >= M9206_MAX_ADAPTERS) { printk(KERN_INFO "m920x: Unexpected adapter endpoint!\n"); return -EINVAL; } adap_enabled[epi] = 1; } for (i = 0; i < M9206_MAX_ADAPTERS; i++) { if (adap_enabled[i]) continue; if ((ret = m920x_set_filter(d, 0x81 + i, 0, 0x0)) != 0) return ret; if ((ret = m920x_set_filter(d, 0x81 + i, 0, 0x02f5)) != 0) return ret; } } return 0; } static int m920x_init_ep(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_host_interface *alt; if ((alt = usb_altnum_to_altsetting(intf, 1)) == NULL) { deb("No alt found!\n"); return -ENODEV; } return usb_set_interface(udev, alt->desc.bInterfaceNumber, alt->desc.bAlternateSetting); } static inline void m920x_parse_rc_state(struct dvb_usb_device *d, u8 rc_state, int *state) { struct m920x_state *m = d->priv; switch (rc_state) { case 0x80: *state = REMOTE_NO_KEY_PRESSED; break; case 0x88: /* framing error or "invalid code" */ case 0x99: case 0xc0: case 0xd8: *state = REMOTE_NO_KEY_PRESSED; m->rep_count = 0; break; case 0x93: case 0x92: case 0x83: /* pinnacle PCTV310e */ case 0x82: m->rep_count = 0; *state = REMOTE_KEY_PRESSED; break; case 0x91: case 0x81: /* pinnacle PCTV310e */ /* prevent immediate auto-repeat */ if (++m->rep_count > 2) *state = REMOTE_KEY_REPEAT; else *state = REMOTE_NO_KEY_PRESSED; break; default: deb("Unexpected rc state %02x\n", rc_state); *state = REMOTE_NO_KEY_PRESSED; break; } } static int m920x_rc_query(struct dvb_usb_device *d, u32 *event, int *state) { int i, ret = 0; u8 *rc_state; rc_state = kmalloc(2, GFP_KERNEL); if (!rc_state) return -ENOMEM; ret = m920x_read(d->udev, M9206_CORE, 0x0, M9206_RC_STATE, rc_state, 1); if (ret != 0) goto out; ret = m920x_read(d->udev, M9206_CORE, 0x0, M9206_RC_KEY, rc_state + 1, 1); if (ret != 0) goto out; m920x_parse_rc_state(d, rc_state[0], state); for (i = 0; i < d->props.rc.legacy.rc_map_size; i++) if (rc5_data(&d->props.rc.legacy.rc_map_table[i]) == rc_state[1]) { *event = d->props.rc.legacy.rc_map_table[i].keycode; goto out; } if (rc_state[1] != 0) deb("Unknown rc key %02x\n", rc_state[1]); *state = REMOTE_NO_KEY_PRESSED; out: kfree(rc_state); return ret; } static int m920x_rc_core_query(struct dvb_usb_device *d) { int ret = 0; u8 *rc_state; int state; rc_state = kmalloc(2, GFP_KERNEL); if (!rc_state) return -ENOMEM; if ((ret = m920x_read(d->udev, M9206_CORE, 0x0, M9206_RC_STATE, &rc_state[0], 1)) != 0) goto out; if ((ret = m920x_read(d->udev, M9206_CORE, 0x0, M9206_RC_KEY, &rc_state[1], 1)) != 0) goto out; deb("state=0x%02x keycode=0x%02x\n", rc_state[0], rc_state[1]); m920x_parse_rc_state(d, rc_state[0], &state); if (state == REMOTE_NO_KEY_PRESSED) rc_keyup(d->rc_dev); else if (state == REMOTE_KEY_REPEAT) rc_repeat(d->rc_dev); else rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, rc_state[1], 0); out: kfree(rc_state); return ret; } /* I2C */ static int m920x_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); int i, j; int ret = 0; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; for (i = 0; i < num; i++) { if (msg[i].flags & (I2C_M_NO_RD_ACK | I2C_M_IGNORE_NAK | I2C_M_TEN) || msg[i].len == 0) { /* For a 0 byte message, I think sending the address * to index 0x80|0x40 would be the correct thing to * do. However, zero byte messages are only used for * probing, and since we don't know how to get the * slave's ack, we can't probe. */ ret = -ENOTSUPP; goto unlock; } /* Send START & address/RW bit */ if (!(msg[i].flags & I2C_M_NOSTART)) { if ((ret = m920x_write(d->udev, M9206_I2C, (msg[i].addr << 1) | (msg[i].flags & I2C_M_RD ? 0x01 : 0), 0x80)) != 0) goto unlock; /* Should check for ack here, if we knew how. */ } if (msg[i].flags & I2C_M_RD) { char *read = kmalloc(1, GFP_KERNEL); if (!read) { ret = -ENOMEM; goto unlock; } for (j = 0; j < msg[i].len; j++) { /* Last byte of transaction? * Send STOP, otherwise send ACK. */ int stop = (i+1 == num && j+1 == msg[i].len) ? 0x40 : 0x01; if ((ret = m920x_read(d->udev, M9206_I2C, 0x0, 0x20 | stop, read, 1)) != 0) { kfree(read); goto unlock; } msg[i].buf[j] = read[0]; } kfree(read); } else { for (j = 0; j < msg[i].len; j++) { /* Last byte of transaction? Then send STOP. */ int stop = (i+1 == num && j+1 == msg[i].len) ? 0x40 : 0x00; if ((ret = m920x_write(d->udev, M9206_I2C, msg[i].buf[j], stop)) != 0) goto unlock; /* Should check for ack here too. */ } } } ret = num; unlock: mutex_unlock(&d->i2c_mutex); return ret; } static u32 m920x_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static const struct i2c_algorithm m920x_i2c_algo = { .master_xfer = m920x_i2c_xfer, .functionality = m920x_i2c_func, }; /* pid filter */ static int m920x_set_filter(struct dvb_usb_device *d, int type, int idx, int pid) { int ret = 0; if (pid >= 0x8000) return -EINVAL; pid |= 0x8000; if ((ret = m920x_write(d->udev, M9206_FILTER, pid, (type << 8) | (idx * 4) )) != 0) return ret; if ((ret = m920x_write(d->udev, M9206_FILTER, 0, (type << 8) | (idx * 4) )) != 0) return ret; return ret; } static int m920x_update_filters(struct dvb_usb_adapter *adap) { struct m920x_state *m = adap->dev->priv; int enabled = m->filtering_enabled[adap->id]; int i, ret = 0, filter = 0; int ep = adap->props.fe[0].stream.endpoint; for (i = 0; i < M9206_MAX_FILTERS; i++) if (m->filters[adap->id][i] == 8192) enabled = 0; /* Disable all filters */ if ((ret = m920x_set_filter(adap->dev, ep, 1, enabled)) != 0) return ret; for (i = 0; i < M9206_MAX_FILTERS; i++) if ((ret = m920x_set_filter(adap->dev, ep, i + 2, 0)) != 0) return ret; /* Set */ if (enabled) { for (i = 0; i < M9206_MAX_FILTERS; i++) { if (m->filters[adap->id][i] == 0) continue; if ((ret = m920x_set_filter(adap->dev, ep, filter + 2, m->filters[adap->id][i])) != 0) return ret; filter++; } } return ret; } static int m920x_pid_filter_ctrl(struct dvb_usb_adapter *adap, int onoff) { struct m920x_state *m = adap->dev->priv; m->filtering_enabled[adap->id] = onoff ? 1 : 0; return m920x_update_filters(adap); } static int m920x_pid_filter(struct dvb_usb_adapter *adap, int index, u16 pid, int onoff) { struct m920x_state *m = adap->dev->priv; m->filters[adap->id][index] = onoff ? pid : 0; return m920x_update_filters(adap); } static int m920x_firmware_download(struct usb_device *udev, const struct firmware *fw) { u16 value, index, size; u8 *read, *buff; int i, pass, ret = 0; buff = kmalloc(65536, GFP_KERNEL); if (buff == NULL) return -ENOMEM; read = kmalloc(4, GFP_KERNEL); if (!read) { kfree(buff); return -ENOMEM; } if ((ret = m920x_read(udev, M9206_FILTER, 0x0, 0x8000, read, 4)) != 0) goto done; deb("%*ph\n", 4, read); if ((ret = m920x_read(udev, M9206_FW, 0x0, 0x0, read, 1)) != 0) goto done; deb("%x\n", read[0]); for (pass = 0; pass < 2; pass++) { for (i = 0; i + (sizeof(u16) * 3) < fw->size;) { value = get_unaligned_le16(fw->data + i); i += sizeof(u16); index = get_unaligned_le16(fw->data + i); i += sizeof(u16); size = get_unaligned_le16(fw->data + i); i += sizeof(u16); if (pass == 1) { /* Will stall if using fw->data ... */ memcpy(buff, fw->data + i, size); ret = usb_control_msg(udev, usb_sndctrlpipe(udev,0), M9206_FW, USB_TYPE_VENDOR | USB_DIR_OUT, value, index, buff, size, 20); if (ret != size) { deb("error while uploading fw!\n"); ret = -EIO; goto done; } msleep(3); } i += size; } if (i != fw->size) { deb("bad firmware file!\n"); ret = -EINVAL; goto done; } } msleep(36); /* m920x will disconnect itself from the bus after this. */ (void) m920x_write(udev, M9206_CORE, 0x01, M9206_FW_GO); deb("firmware uploaded!\n"); done: kfree(read); kfree(buff); return ret; } /* Callbacks for DVB USB */ static int m920x_identify_state(struct usb_device *udev, const struct dvb_usb_device_properties *props, const struct dvb_usb_device_description **desc, int *cold) { struct usb_host_interface *alt; alt = usb_altnum_to_altsetting(usb_ifnum_to_if(udev, 0), 1); *cold = (alt == NULL) ? 1 : 0; return 0; } /* demod configurations */ static int m920x_mt352_demod_init(struct dvb_frontend *fe) { int ret; static const u8 config[] = { CONFIG, 0x3d }; static const u8 clock[] = { CLOCK_CTL, 0x30 }; static const u8 reset[] = { RESET, 0x80 }; static const u8 adc_ctl[] = { ADC_CTL_1, 0x40 }; static const u8 agc[] = { AGC_TARGET, 0x1c, 0x20 }; static const u8 sec_agc[] = { 0x69, 0x00, 0xff, 0xff, 0x40, 0xff, 0x00, 0x40, 0x40 }; static const u8 unk1[] = { 0x93, 0x1a }; static const u8 unk2[] = { 0xb5, 0x7a }; deb("Demod init!\n"); if ((ret = mt352_write(fe, config, ARRAY_SIZE(config))) != 0) return ret; if ((ret = mt352_write(fe, clock, ARRAY_SIZE(clock))) != 0) return ret; if ((ret = mt352_write(fe, reset, ARRAY_SIZE(reset))) != 0) return ret; if ((ret = mt352_write(fe, adc_ctl, ARRAY_SIZE(adc_ctl))) != 0) return ret; if ((ret = mt352_write(fe, agc, ARRAY_SIZE(agc))) != 0) return ret; if ((ret = mt352_write(fe, sec_agc, ARRAY_SIZE(sec_agc))) != 0) return ret; if ((ret = mt352_write(fe, unk1, ARRAY_SIZE(unk1))) != 0) return ret; if ((ret = mt352_write(fe, unk2, ARRAY_SIZE(unk2))) != 0) return ret; return 0; } static struct mt352_config m920x_mt352_config = { .demod_address = 0x0f, .no_tuner = 1, .demod_init = m920x_mt352_demod_init, }; static struct tda1004x_config m920x_tda10046_08_config = { .demod_address = 0x08, .invert = 0, .invert_oclk = 0, .ts_mode = TDA10046_TS_SERIAL, .xtal_freq = TDA10046_XTAL_16M, .if_freq = TDA10046_FREQ_045, .agc_config = TDA10046_AGC_TDA827X, .gpio_config = TDA10046_GPTRI, .request_firmware = NULL, }; static struct tda1004x_config m920x_tda10046_0b_config = { .demod_address = 0x0b, .invert = 0, .invert_oclk = 0, .ts_mode = TDA10046_TS_SERIAL, .xtal_freq = TDA10046_XTAL_16M, .if_freq = TDA10046_FREQ_045, .agc_config = TDA10046_AGC_TDA827X, .gpio_config = TDA10046_GPTRI, .request_firmware = NULL, /* uses firmware EEPROM */ }; /* tuner configurations */ static struct qt1010_config m920x_qt1010_config = { .i2c_address = 0x62 }; static struct mt2060_config m920x_mt2060_config = { .i2c_address = 0x60, /* 0xc0 */ .clock_out = 0, }; /* Callbacks for DVB USB */ static int m920x_mt352_frontend_attach(struct dvb_usb_adapter *adap) { deb("%s\n",__func__); adap->fe_adap[0].fe = dvb_attach(mt352_attach, &m920x_mt352_config, &adap->dev->i2c_adap); if ((adap->fe_adap[0].fe) == NULL) return -EIO; return 0; } static int m920x_mt352_frontend_attach_vp7049(struct dvb_usb_adapter *adap) { struct m920x_inits vp7049_fe_init_seq[] = { /* XXX without these commands the frontend cannot be detected, * they must be sent BEFORE the frontend is attached */ { 0xff28, 0x00 }, { 0xff23, 0x00 }, { 0xff28, 0x00 }, { 0xff23, 0x00 }, { 0xff21, 0x20 }, { 0xff21, 0x60 }, { 0xff28, 0x00 }, { 0xff22, 0x00 }, { 0xff20, 0x30 }, { 0xff20, 0x20 }, { 0xff20, 0x30 }, { } /* terminating entry */ }; int ret; deb("%s\n", __func__); ret = m920x_write_seq(adap->dev->udev, M9206_CORE, vp7049_fe_init_seq); if (ret != 0) { deb("Initialization of vp7049 frontend failed."); return ret; } return m920x_mt352_frontend_attach(adap); } static int m920x_tda10046_08_frontend_attach(struct dvb_usb_adapter *adap) { deb("%s\n",__func__); adap->fe_adap[0].fe = dvb_attach(tda10046_attach, &m920x_tda10046_08_config, &adap->dev->i2c_adap); if ((adap->fe_adap[0].fe) == NULL) return -EIO; return 0; } static int m920x_tda10046_0b_frontend_attach(struct dvb_usb_adapter *adap) { deb("%s\n",__func__); adap->fe_adap[0].fe = dvb_attach(tda10046_attach, &m920x_tda10046_0b_config, &adap->dev->i2c_adap); if ((adap->fe_adap[0].fe) == NULL) return -EIO; return 0; } static int m920x_qt1010_tuner_attach(struct dvb_usb_adapter *adap) { deb("%s\n",__func__); if (dvb_attach(qt1010_attach, adap->fe_adap[0].fe, &adap->dev->i2c_adap, &m920x_qt1010_config) == NULL) return -ENODEV; return 0; } static int m920x_tda8275_60_tuner_attach(struct dvb_usb_adapter *adap) { deb("%s\n",__func__); if (dvb_attach(tda827x_attach, adap->fe_adap[0].fe, 0x60, &adap->dev->i2c_adap, NULL) == NULL) return -ENODEV; return 0; } static int m920x_tda8275_61_tuner_attach(struct dvb_usb_adapter *adap) { deb("%s\n",__func__); if (dvb_attach(tda827x_attach, adap->fe_adap[0].fe, 0x61, &adap->dev->i2c_adap, NULL) == NULL) return -ENODEV; return 0; } static int m920x_fmd1216me_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(simple_tuner_attach, adap->fe_adap[0].fe, &adap->dev->i2c_adap, 0x61, TUNER_PHILIPS_FMD1216ME_MK3); return 0; } static int m920x_mt2060_tuner_attach(struct dvb_usb_adapter *adap) { deb("%s\n", __func__); if (dvb_attach(mt2060_attach, adap->fe_adap[0].fe, &adap->dev->i2c_adap, &m920x_mt2060_config, 1220) == NULL) return -ENODEV; return 0; } /* device-specific initialization */ static struct m920x_inits megasky_rc_init [] = { { M9206_RC_INIT2, 0xa8 }, { M9206_RC_INIT1, 0x51 }, { } /* terminating entry */ }; static struct m920x_inits tvwalkertwin_rc_init [] = { { M9206_RC_INIT2, 0x00 }, { M9206_RC_INIT1, 0xef }, { 0xff28, 0x00 }, { 0xff23, 0x00 }, { 0xff21, 0x30 }, { } /* terminating entry */ }; static struct m920x_inits pinnacle310e_init[] = { /* without these the tuner doesn't work */ { 0xff20, 0x9b }, { 0xff22, 0x70 }, /* rc settings */ { 0xff50, 0x80 }, { M9206_RC_INIT1, 0x00 }, { M9206_RC_INIT2, 0xff }, { } /* terminating entry */ }; static struct m920x_inits vp7049_rc_init[] = { { 0xff28, 0x00 }, { 0xff23, 0x00 }, { 0xff21, 0x70 }, { M9206_RC_INIT2, 0x00 }, { M9206_RC_INIT1, 0xff }, { } /* terminating entry */ }; /* ir keymaps */ static struct rc_map_table rc_map_megasky_table[] = { { 0x0012, KEY_POWER }, { 0x001e, KEY_CYCLEWINDOWS }, /* min/max */ { 0x0002, KEY_CHANNELUP }, { 0x0005, KEY_CHANNELDOWN }, { 0x0003, KEY_VOLUMEUP }, { 0x0006, KEY_VOLUMEDOWN }, { 0x0004, KEY_MUTE }, { 0x0007, KEY_OK }, /* TS */ { 0x0008, KEY_STOP }, { 0x0009, KEY_MENU }, /* swap */ { 0x000a, KEY_REWIND }, { 0x001b, KEY_PAUSE }, { 0x001f, KEY_FASTFORWARD }, { 0x000c, KEY_RECORD }, { 0x000d, KEY_CAMERA }, /* screenshot */ { 0x000e, KEY_COFFEE }, /* "MTS" */ }; static struct rc_map_table rc_map_tvwalkertwin_table[] = { { 0x0001, KEY_ZOOM }, /* Full Screen */ { 0x0002, KEY_CAMERA }, /* snapshot */ { 0x0003, KEY_MUTE }, { 0x0004, KEY_REWIND }, { 0x0005, KEY_PLAYPAUSE }, /* Play/Pause */ { 0x0006, KEY_FASTFORWARD }, { 0x0007, KEY_RECORD }, { 0x0008, KEY_STOP }, { 0x0009, KEY_TIME }, /* Timeshift */ { 0x000c, KEY_COFFEE }, /* Recall */ { 0x000e, KEY_CHANNELUP }, { 0x0012, KEY_POWER }, { 0x0015, KEY_MENU }, /* source */ { 0x0018, KEY_CYCLEWINDOWS }, /* TWIN PIP */ { 0x001a, KEY_CHANNELDOWN }, { 0x001b, KEY_VOLUMEDOWN }, { 0x001e, KEY_VOLUMEUP }, }; static struct rc_map_table rc_map_pinnacle310e_table[] = { { 0x16, KEY_POWER }, { 0x17, KEY_FAVORITES }, { 0x0f, KEY_TEXT }, { 0x48, KEY_PROGRAM }, /* preview */ { 0x1c, KEY_EPG }, { 0x04, KEY_LIST }, /* record list */ { 0x03, KEY_1 }, { 0x01, KEY_2 }, { 0x06, KEY_3 }, { 0x09, KEY_4 }, { 0x1d, KEY_5 }, { 0x1f, KEY_6 }, { 0x0d, KEY_7 }, { 0x19, KEY_8 }, { 0x1b, KEY_9 }, { 0x15, KEY_0 }, { 0x0c, KEY_CANCEL }, { 0x4a, KEY_CLEAR }, { 0x13, KEY_BACK }, { 0x00, KEY_TAB }, { 0x4b, KEY_UP }, { 0x4e, KEY_LEFT }, { 0x52, KEY_RIGHT }, { 0x51, KEY_DOWN }, { 0x4f, KEY_ENTER }, /* could also be KEY_OK */ { 0x1e, KEY_VOLUMEUP }, { 0x0a, KEY_VOLUMEDOWN }, { 0x05, KEY_CHANNELUP }, { 0x02, KEY_CHANNELDOWN }, { 0x11, KEY_RECORD }, { 0x14, KEY_PLAY }, { 0x4c, KEY_PAUSE }, { 0x1a, KEY_STOP }, { 0x40, KEY_REWIND }, { 0x12, KEY_FASTFORWARD }, { 0x41, KEY_PREVIOUSSONG }, /* Replay */ { 0x42, KEY_NEXTSONG }, /* Skip */ { 0x54, KEY_CAMERA }, /* Capture */ /* { 0x50, KEY_SAP }, */ /* Sap */ { 0x47, KEY_CYCLEWINDOWS }, /* Pip */ { 0x4d, KEY_SCREEN }, /* FullScreen */ { 0x08, KEY_SUBTITLE }, { 0x0e, KEY_MUTE }, /* { 0x49, KEY_LR }, */ /* L/R */ { 0x07, KEY_SLEEP }, /* Hibernate */ { 0x08, KEY_VIDEO }, /* A/V */ { 0x0e, KEY_MENU }, /* Recall */ { 0x45, KEY_ZOOMIN }, { 0x46, KEY_ZOOMOUT }, { 0x18, KEY_RED }, /* Red */ { 0x53, KEY_GREEN }, /* Green */ { 0x5e, KEY_YELLOW }, /* Yellow */ { 0x5f, KEY_BLUE }, /* Blue */ }; /* DVB USB Driver stuff */ static struct dvb_usb_device_properties megasky_properties; static struct dvb_usb_device_properties digivox_mini_ii_properties; static struct dvb_usb_device_properties tvwalkertwin_properties; static struct dvb_usb_device_properties dposh_properties; static struct dvb_usb_device_properties pinnacle_pctv310e_properties; static struct dvb_usb_device_properties vp7049_properties; static int m920x_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct dvb_usb_device *d = NULL; int ret; struct m920x_inits *rc_init_seq = NULL; int bInterfaceNumber = intf->cur_altsetting->desc.bInterfaceNumber; deb("Probing for m920x device at interface %d\n", bInterfaceNumber); if (bInterfaceNumber == 0) { /* Single-tuner device, or first interface on * multi-tuner device */ ret = dvb_usb_device_init(intf, &megasky_properties, THIS_MODULE, &d, adapter_nr); if (ret == 0) { rc_init_seq = megasky_rc_init; goto found; } ret = dvb_usb_device_init(intf, &digivox_mini_ii_properties, THIS_MODULE, &d, adapter_nr); if (ret == 0) { /* No remote control, so no rc_init_seq */ goto found; } /* This configures both tuners on the TV Walker Twin */ ret = dvb_usb_device_init(intf, &tvwalkertwin_properties, THIS_MODULE, &d, adapter_nr); if (ret == 0) { rc_init_seq = tvwalkertwin_rc_init; goto found; } ret = dvb_usb_device_init(intf, &dposh_properties, THIS_MODULE, &d, adapter_nr); if (ret == 0) { /* Remote controller not supported yet. */ goto found; } ret = dvb_usb_device_init(intf, &pinnacle_pctv310e_properties, THIS_MODULE, &d, adapter_nr); if (ret == 0) { rc_init_seq = pinnacle310e_init; goto found; } ret = dvb_usb_device_init(intf, &vp7049_properties, THIS_MODULE, &d, adapter_nr); if (ret == 0) { rc_init_seq = vp7049_rc_init; goto found; } return ret; } else { /* Another interface on a multi-tuner device */ /* The LifeView TV Walker Twin gets here, but struct * tvwalkertwin_properties already configured both * tuners, so there is nothing for us to do here */ } found: if ((ret = m920x_init_ep(intf)) < 0) return ret; if (d && (ret = m920x_init(d, rc_init_seq)) != 0) return ret; return ret; } enum { MSI_MEGASKY580, ANUBIS_MSI_DIGI_VOX_MINI_II, ANUBIS_LIFEVIEW_TV_WALKER_TWIN_COLD, ANUBIS_LIFEVIEW_TV_WALKER_TWIN_WARM, DPOSH_M9206_COLD, DPOSH_M9206_WARM, VISIONPLUS_PINNACLE_PCTV310E, AZUREWAVE_TWINHAN_VP7049, }; static const struct usb_device_id m920x_table[] = { DVB_USB_DEV(MSI, MSI_MEGASKY580), DVB_USB_DEV(ANUBIS_ELECTRONIC, ANUBIS_MSI_DIGI_VOX_MINI_II), DVB_USB_DEV(ANUBIS_ELECTRONIC, ANUBIS_LIFEVIEW_TV_WALKER_TWIN_COLD), DVB_USB_DEV(ANUBIS_ELECTRONIC, ANUBIS_LIFEVIEW_TV_WALKER_TWIN_WARM), DVB_USB_DEV(DPOSH, DPOSH_M9206_COLD), DVB_USB_DEV(DPOSH, DPOSH_M9206_WARM), DVB_USB_DEV(VISIONPLUS, VISIONPLUS_PINNACLE_PCTV310E), DVB_USB_DEV(AZUREWAVE, AZUREWAVE_TWINHAN_VP7049), { } }; MODULE_DEVICE_TABLE (usb, m920x_table); static struct dvb_usb_device_properties megasky_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-megasky-02.fw", .download_firmware = m920x_firmware_download, .rc.legacy = { .rc_interval = 100, .rc_map_table = rc_map_megasky_table, .rc_map_size = ARRAY_SIZE(rc_map_megasky_table), .rc_query = m920x_rc_query, }, .size_of_priv = sizeof(struct m920x_state), .identify_state = m920x_identify_state, .num_adapters = 1, .adapter = {{ .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 8, .pid_filter = m920x_pid_filter, .pid_filter_ctrl = m920x_pid_filter_ctrl, .frontend_attach = m920x_mt352_frontend_attach, .tuner_attach = m920x_qt1010_tuner_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x81, .u = { .bulk = { .buffersize = 512, } } }, }}, }}, .i2c_algo = &m920x_i2c_algo, .num_device_descs = 1, .devices = { { "MSI Mega Sky 580 DVB-T USB2.0", { &m920x_table[MSI_MEGASKY580], NULL }, { NULL }, } } }; static struct dvb_usb_device_properties digivox_mini_ii_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-digivox-02.fw", .download_firmware = m920x_firmware_download, .size_of_priv = sizeof(struct m920x_state), .identify_state = m920x_identify_state, .num_adapters = 1, .adapter = {{ .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 8, .pid_filter = m920x_pid_filter, .pid_filter_ctrl = m920x_pid_filter_ctrl, .frontend_attach = m920x_tda10046_08_frontend_attach, .tuner_attach = m920x_tda8275_60_tuner_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x81, .u = { .bulk = { .buffersize = 0x4000, } } }, }}, }}, .i2c_algo = &m920x_i2c_algo, .num_device_descs = 1, .devices = { { "MSI DIGI VOX mini II DVB-T USB2.0", { &m920x_table[ANUBIS_MSI_DIGI_VOX_MINI_II], NULL }, { NULL }, }, } }; /* LifeView TV Walker Twin support by Nick Andrew <nick@nick-andrew.net> * * LifeView TV Walker Twin has 1 x M9206, 2 x TDA10046, 2 x TDA8275A * TDA10046 #0 is located at i2c address 0x08 * TDA10046 #1 is located at i2c address 0x0b * TDA8275A #0 is located at i2c address 0x60 * TDA8275A #1 is located at i2c address 0x61 */ static struct dvb_usb_device_properties tvwalkertwin_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-tvwalkert.fw", .download_firmware = m920x_firmware_download, .rc.legacy = { .rc_interval = 100, .rc_map_table = rc_map_tvwalkertwin_table, .rc_map_size = ARRAY_SIZE(rc_map_tvwalkertwin_table), .rc_query = m920x_rc_query, }, .size_of_priv = sizeof(struct m920x_state), .identify_state = m920x_identify_state, .num_adapters = 2, .adapter = {{ .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 8, .pid_filter = m920x_pid_filter, .pid_filter_ctrl = m920x_pid_filter_ctrl, .frontend_attach = m920x_tda10046_08_frontend_attach, .tuner_attach = m920x_tda8275_60_tuner_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x81, .u = { .bulk = { .buffersize = 512, } } }}, }},{ .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 8, .pid_filter = m920x_pid_filter, .pid_filter_ctrl = m920x_pid_filter_ctrl, .frontend_attach = m920x_tda10046_0b_frontend_attach, .tuner_attach = m920x_tda8275_61_tuner_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 512, } } }}, }, }}, .i2c_algo = &m920x_i2c_algo, .num_device_descs = 1, .devices = { { .name = "LifeView TV Walker Twin DVB-T USB2.0", .cold_ids = { &m920x_table[ANUBIS_LIFEVIEW_TV_WALKER_TWIN_COLD], NULL }, .warm_ids = { &m920x_table[ANUBIS_LIFEVIEW_TV_WALKER_TWIN_WARM], NULL }, }, } }; static struct dvb_usb_device_properties dposh_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-dposh-01.fw", .download_firmware = m920x_firmware_download, .size_of_priv = sizeof(struct m920x_state), .identify_state = m920x_identify_state, .num_adapters = 1, .adapter = {{ .num_frontends = 1, .fe = {{ /* Hardware pid filters don't work with this device/firmware */ .frontend_attach = m920x_mt352_frontend_attach, .tuner_attach = m920x_qt1010_tuner_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x81, .u = { .bulk = { .buffersize = 512, } } }, }}, }}, .i2c_algo = &m920x_i2c_algo, .num_device_descs = 1, .devices = { { .name = "Dposh DVB-T USB2.0", .cold_ids = { &m920x_table[DPOSH_M9206_COLD], NULL }, .warm_ids = { &m920x_table[DPOSH_M9206_WARM], NULL }, }, } }; static struct dvb_usb_device_properties pinnacle_pctv310e_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .download_firmware = NULL, .rc.legacy = { .rc_interval = 100, .rc_map_table = rc_map_pinnacle310e_table, .rc_map_size = ARRAY_SIZE(rc_map_pinnacle310e_table), .rc_query = m920x_rc_query, }, .size_of_priv = sizeof(struct m920x_state), .identify_state = m920x_identify_state, .num_adapters = 1, .adapter = {{ .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 8, .pid_filter = m920x_pid_filter, .pid_filter_ctrl = m920x_pid_filter_ctrl, .frontend_attach = m920x_mt352_frontend_attach, .tuner_attach = m920x_fmd1216me_tuner_attach, .stream = { .type = USB_ISOC, .count = 5, .endpoint = 0x84, .u = { .isoc = { .framesperurb = 128, .framesize = 564, .interval = 1, } } }, }}, } }, .i2c_algo = &m920x_i2c_algo, .num_device_descs = 1, .devices = { { "Pinnacle PCTV 310e", { &m920x_table[VISIONPLUS_PINNACLE_PCTV310E], NULL }, { NULL }, } } }; static struct dvb_usb_device_properties vp7049_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = "dvb-usb-vp7049-0.95.fw", .download_firmware = m920x_firmware_download, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_TWINHAN_VP1027_DVBS, .rc_query = m920x_rc_core_query, .allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .size_of_priv = sizeof(struct m920x_state), .identify_state = m920x_identify_state, .num_adapters = 1, .adapter = {{ .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 8, .pid_filter = m920x_pid_filter, .pid_filter_ctrl = m920x_pid_filter_ctrl, .frontend_attach = m920x_mt352_frontend_attach_vp7049, .tuner_attach = m920x_mt2060_tuner_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x81, .u = { .bulk = { .buffersize = 512, } } }, } }, } }, .i2c_algo = &m920x_i2c_algo, .num_device_descs = 1, .devices = { { "DTV-DVB UDTT7049", { &m920x_table[AZUREWAVE_TWINHAN_VP7049], NULL }, { NULL }, } } }; static struct usb_driver m920x_driver = { .name = "dvb_usb_m920x", .probe = m920x_probe, .disconnect = dvb_usb_device_exit, .id_table = m920x_table, }; module_usb_driver(m920x_driver); MODULE_AUTHOR("Aapo Tahkola <aet@rasterburn.org>"); MODULE_DESCRIPTION("DVB Driver for ULI M920x"); MODULE_VERSION("0.1"); MODULE_LICENSE("GPL");
7 40 15 15 15 10 15 15 8 58 3 27 28 2 53 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/export.h> #include "rds.h" /* * This file implements a getsockopt() call which copies a set of fixed * sized structs into a user-specified buffer as a means of providing * read-only information about RDS. * * For a given information source there are a given number of fixed sized * structs at a given time. The structs are only copied if the user-specified * buffer is big enough. The destination pages that make up the buffer * are pinned for the duration of the copy. * * This gives us the following benefits: * * - simple implementation, no copy "position" across multiple calls * - consistent snapshot of an info source * - atomic copy works well with whatever locking info source has * - one portable tool to get rds info across implementations * - long-lived tool can get info without allocating * * at the following costs: * * - info source copy must be pinned, may be "large" */ struct rds_info_iterator { struct page **pages; void *addr; unsigned long offset; }; static DEFINE_SPINLOCK(rds_info_lock); static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; void rds_info_register_func(int optname, rds_info_func func) { int offset = optname - RDS_INFO_FIRST; BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); spin_lock(&rds_info_lock); BUG_ON(rds_info_funcs[offset]); rds_info_funcs[offset] = func; spin_unlock(&rds_info_lock); } EXPORT_SYMBOL_GPL(rds_info_register_func); void rds_info_deregister_func(int optname, rds_info_func func) { int offset = optname - RDS_INFO_FIRST; BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); spin_lock(&rds_info_lock); BUG_ON(rds_info_funcs[offset] != func); rds_info_funcs[offset] = NULL; spin_unlock(&rds_info_lock); } EXPORT_SYMBOL_GPL(rds_info_deregister_func); /* * Typically we hold an atomic kmap across multiple rds_info_copy() calls * because the kmap is so expensive. This must be called before using blocking * operations while holding the mapping and as the iterator is torn down. */ void rds_info_iter_unmap(struct rds_info_iterator *iter) { if (iter->addr) { kunmap_atomic(iter->addr); iter->addr = NULL; } } /* * get_user_pages() called flush_dcache_page() on the pages for us. */ void rds_info_copy(struct rds_info_iterator *iter, void *data, unsigned long bytes) { unsigned long this; while (bytes) { if (!iter->addr) iter->addr = kmap_atomic(*iter->pages); this = min(bytes, PAGE_SIZE - iter->offset); rdsdebug("page %p addr %p offset %lu this %lu data %p " "bytes %lu\n", *iter->pages, iter->addr, iter->offset, this, data, bytes); memcpy(iter->addr + iter->offset, data, this); data += this; bytes -= this; iter->offset += this; if (iter->offset == PAGE_SIZE) { kunmap_atomic(iter->addr); iter->addr = NULL; iter->offset = 0; iter->pages++; } } } EXPORT_SYMBOL_GPL(rds_info_copy); /* * @optval points to the userspace buffer that the information snapshot * will be copied into. * * @optlen on input is the size of the buffer in userspace. @optlen * on output is the size of the requested snapshot in bytes. * * This function returns -errno if there is a failure, particularly -ENOSPC * if the given userspace buffer was not large enough to fit the snapshot. * On success it returns the positive number of bytes of each array element * in the snapshot. */ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct rds_info_iterator iter; struct rds_info_lengths lens; unsigned long nr_pages = 0; unsigned long start; rds_info_func func; struct page **pages = NULL; int ret; int len; int total; if (get_user(len, optlen)) { ret = -EFAULT; goto out; } /* check for all kinds of wrapping and the like */ start = (unsigned long)optval; if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) { ret = -EINVAL; goto out; } /* a 0 len call is just trying to probe its length */ if (len == 0) goto call_func; nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) >> PAGE_SHIFT; pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; else nr_pages = 0; ret = -EAGAIN; /* XXX ? */ goto out; } rdsdebug("len %d nr_pages %lu\n", len, nr_pages); call_func: func = rds_info_funcs[optname - RDS_INFO_FIRST]; if (!func) { ret = -ENOPROTOOPT; goto out; } iter.pages = pages; iter.addr = NULL; iter.offset = start & (PAGE_SIZE - 1); func(sock, len, &iter, &lens); BUG_ON(lens.each == 0); total = lens.nr * lens.each; rds_info_iter_unmap(&iter); if (total > len) { len = total; ret = -ENOSPC; } else { len = total; ret = lens.each; } if (put_user(len, optlen)) ret = -EFAULT; out: if (pages) unpin_user_pages(pages, nr_pages); kfree(pages); return ret; }
1 1 5 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 // SPDX-License-Identifier: GPL-2.0 /* * (C) Copyright 2002-2004, 2007 Greg Kroah-Hartman <greg@kroah.com> * (C) Copyright 2007 Novell Inc. */ #include <linux/pci.h> #include <linux/module.h> #include <linux/init.h> #include <linux/device.h> #include <linux/mempolicy.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/isolation.h> #include <linux/cpu.h> #include <linux/pm_runtime.h> #include <linux/suspend.h> #include <linux/kexec.h> #include <linux/of_device.h> #include <linux/acpi.h> #include <linux/dma-map-ops.h> #include <linux/iommu.h> #include "pci.h" #include "pcie/portdrv.h" struct pci_dynid { struct list_head node; struct pci_device_id id; }; /** * pci_add_dynid - add a new PCI device ID to this driver and re-probe devices * @drv: target pci driver * @vendor: PCI vendor ID * @device: PCI device ID * @subvendor: PCI subvendor ID * @subdevice: PCI subdevice ID * @class: PCI class * @class_mask: PCI class mask * @driver_data: private driver data * * Adds a new dynamic pci device ID to this driver and causes the * driver to probe for all devices again. @drv must have been * registered prior to calling this function. * * CONTEXT: * Does GFP_KERNEL allocation. * * RETURNS: * 0 on success, -errno on failure. */ int pci_add_dynid(struct pci_driver *drv, unsigned int vendor, unsigned int device, unsigned int subvendor, unsigned int subdevice, unsigned int class, unsigned int class_mask, unsigned long driver_data) { struct pci_dynid *dynid; dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); if (!dynid) return -ENOMEM; dynid->id.vendor = vendor; dynid->id.device = device; dynid->id.subvendor = subvendor; dynid->id.subdevice = subdevice; dynid->id.class = class; dynid->id.class_mask = class_mask; dynid->id.driver_data = driver_data; spin_lock(&drv->dynids.lock); list_add_tail(&dynid->node, &drv->dynids.list); spin_unlock(&drv->dynids.lock); return driver_attach(&drv->driver); } EXPORT_SYMBOL_GPL(pci_add_dynid); static void pci_free_dynids(struct pci_driver *drv) { struct pci_dynid *dynid, *n; spin_lock(&drv->dynids.lock); list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) { list_del(&dynid->node); kfree(dynid); } spin_unlock(&drv->dynids.lock); } /** * pci_match_id - See if a PCI device matches a given pci_id table * @ids: array of PCI device ID structures to search in * @dev: the PCI device structure to match against. * * Used by a driver to check whether a PCI device is in its list of * supported devices. Returns the matching pci_device_id structure or * %NULL if there is no match. * * Deprecated; don't use this as it will not catch any dynamic IDs * that a driver might want to check for. */ const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev) { if (ids) { while (ids->vendor || ids->subvendor || ids->class_mask) { if (pci_match_one_device(ids, dev)) return ids; ids++; } } return NULL; } EXPORT_SYMBOL(pci_match_id); static const struct pci_device_id pci_device_id_any = { .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, }; /** * pci_match_device - See if a device matches a driver's list of IDs * @drv: the PCI driver to match against * @dev: the PCI device structure to match against * * Used by a driver to check whether a PCI device is in its list of * supported devices or in the dynids list, which may have been augmented * via the sysfs "new_id" file. Returns the matching pci_device_id * structure or %NULL if there is no match. */ static const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev) { struct pci_dynid *dynid; const struct pci_device_id *found_id = NULL, *ids; /* When driver_override is set, only bind to the matching driver */ if (dev->driver_override && strcmp(dev->driver_override, drv->name)) return NULL; /* Look at the dynamic ids first, before the static ones */ spin_lock(&drv->dynids.lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (pci_match_one_device(&dynid->id, dev)) { found_id = &dynid->id; break; } } spin_unlock(&drv->dynids.lock); if (found_id) return found_id; for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); ids = found_id + 1) { /* * The match table is split based on driver_override. * In case override_only was set, enforce driver_override * matching. */ if (found_id->override_only) { if (dev->driver_override) return found_id; } else { return found_id; } } /* driver_override will always match, send a dummy id */ if (dev->driver_override) return &pci_device_id_any; return NULL; } /** * new_id_store - sysfs frontend to pci_add_dynid() * @driver: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Allow PCI IDs to be added to an existing driver via sysfs. */ static ssize_t new_id_store(struct device_driver *driver, const char *buf, size_t count) { struct pci_driver *pdrv = to_pci_driver(driver); const struct pci_device_id *ids = pdrv->id_table; u32 vendor, device, subvendor = PCI_ANY_ID, subdevice = PCI_ANY_ID, class = 0, class_mask = 0; unsigned long driver_data = 0; int fields; int retval = 0; fields = sscanf(buf, "%x %x %x %x %x %x %lx", &vendor, &device, &subvendor, &subdevice, &class, &class_mask, &driver_data); if (fields < 2) return -EINVAL; if (fields != 7) { struct pci_dev *pdev = kzalloc(sizeof(*pdev), GFP_KERNEL); if (!pdev) return -ENOMEM; pdev->vendor = vendor; pdev->device = device; pdev->subsystem_vendor = subvendor; pdev->subsystem_device = subdevice; pdev->class = class; if (pci_match_device(pdrv, pdev)) retval = -EEXIST; kfree(pdev); if (retval) return retval; } /* Only accept driver_data values that match an existing id_table entry */ if (ids) { retval = -EINVAL; while (ids->vendor || ids->subvendor || ids->class_mask) { if (driver_data == ids->driver_data) { retval = 0; break; } ids++; } if (retval) /* No match */ return retval; } retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice, class, class_mask, driver_data); if (retval) return retval; return count; } static DRIVER_ATTR_WO(new_id); /** * remove_id_store - remove a PCI device ID from this driver * @driver: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Removes a dynamic pci device ID to this driver. */ static ssize_t remove_id_store(struct device_driver *driver, const char *buf, size_t count) { struct pci_dynid *dynid, *n; struct pci_driver *pdrv = to_pci_driver(driver); u32 vendor, device, subvendor = PCI_ANY_ID, subdevice = PCI_ANY_ID, class = 0, class_mask = 0; int fields; size_t retval = -ENODEV; fields = sscanf(buf, "%x %x %x %x %x %x", &vendor, &device, &subvendor, &subdevice, &class, &class_mask); if (fields < 2) return -EINVAL; spin_lock(&pdrv->dynids.lock); list_for_each_entry_safe(dynid, n, &pdrv->dynids.list, node) { struct pci_device_id *id = &dynid->id; if ((id->vendor == vendor) && (id->device == device) && (subvendor == PCI_ANY_ID || id->subvendor == subvendor) && (subdevice == PCI_ANY_ID || id->subdevice == subdevice) && !((id->class ^ class) & class_mask)) { list_del(&dynid->node); kfree(dynid); retval = count; break; } } spin_unlock(&pdrv->dynids.lock); return retval; } static DRIVER_ATTR_WO(remove_id); static struct attribute *pci_drv_attrs[] = { &driver_attr_new_id.attr, &driver_attr_remove_id.attr, NULL, }; ATTRIBUTE_GROUPS(pci_drv); struct drv_dev_and_id { struct pci_driver *drv; struct pci_dev *dev; const struct pci_device_id *id; }; static long local_pci_probe(void *_ddi) { struct drv_dev_and_id *ddi = _ddi; struct pci_dev *pci_dev = ddi->dev; struct pci_driver *pci_drv = ddi->drv; struct device *dev = &pci_dev->dev; int rc; /* * Unbound PCI devices are always put in D0, regardless of * runtime PM status. During probe, the device is set to * active and the usage count is incremented. If the driver * supports runtime PM, it should call pm_runtime_put_noidle(), * or any other runtime PM helper function decrementing the usage * count, in its probe routine and pm_runtime_get_noresume() in * its remove routine. */ pm_runtime_get_sync(dev); pci_dev->driver = pci_drv; rc = pci_drv->probe(pci_dev, ddi->id); if (!rc) return rc; if (rc < 0) { pci_dev->driver = NULL; pm_runtime_put_sync(dev); return rc; } /* * Probe function should return < 0 for failure, 0 for success * Treat values > 0 as success, but warn. */ pci_warn(pci_dev, "Driver probe function unexpectedly returned %d\n", rc); return 0; } static bool pci_physfn_is_probed(struct pci_dev *dev) { #ifdef CONFIG_PCI_IOV return dev->is_virtfn && dev->physfn->is_probed; #else return false; #endif } static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, const struct pci_device_id *id) { int error, node, cpu; struct drv_dev_and_id ddi = { drv, dev, id }; /* * Execute driver initialization on node where the device is * attached. This way the driver likely allocates its local memory * on the right node. */ node = dev_to_node(&dev->dev); dev->is_probed = 1; cpu_hotplug_disable(); /* * Prevent nesting work_on_cpu() for the case where a Virtual Function * device is probed from work_on_cpu() of the Physical device. */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node) || pci_physfn_is_probed(dev)) { cpu = nr_cpu_ids; } else { cpumask_var_t wq_domain_mask; if (!zalloc_cpumask_var(&wq_domain_mask, GFP_KERNEL)) { error = -ENOMEM; goto out; } cpumask_and(wq_domain_mask, housekeeping_cpumask(HK_TYPE_WQ), housekeeping_cpumask(HK_TYPE_DOMAIN)); cpu = cpumask_any_and(cpumask_of_node(node), wq_domain_mask); free_cpumask_var(wq_domain_mask); } if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); out: dev->is_probed = 0; cpu_hotplug_enable(); return error; } /** * __pci_device_probe - check if a driver wants to claim a specific PCI device * @drv: driver to call to check if it wants the PCI device * @pci_dev: PCI device being probed * * returns 0 on success, else error. * side-effect: pci_dev->driver is set to drv when drv claims pci_dev. */ static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) { const struct pci_device_id *id; int error = 0; if (drv->probe) { error = -ENODEV; id = pci_match_device(drv, pci_dev); if (id) error = pci_call_probe(drv, pci_dev, id); } return error; } #ifdef CONFIG_PCI_IOV static inline bool pci_device_can_probe(struct pci_dev *pdev) { return (!pdev->is_virtfn || pdev->physfn->sriov->drivers_autoprobe || pdev->driver_override); } #else static inline bool pci_device_can_probe(struct pci_dev *pdev) { return true; } #endif static int pci_device_probe(struct device *dev) { int error; struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = to_pci_driver(dev->driver); if (!pci_device_can_probe(pci_dev)) return -ENODEV; pci_assign_irq(pci_dev); error = pcibios_alloc_irq(pci_dev); if (error < 0) return error; pci_dev_get(pci_dev); error = __pci_device_probe(drv, pci_dev); if (error) { pcibios_free_irq(pci_dev); pci_dev_put(pci_dev); } return error; } static void pci_device_remove(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; if (drv->remove) { pm_runtime_get_sync(dev); /* * If the driver provides a .runtime_idle() callback and it has * started to run already, it may continue to run in parallel * with the code below, so wait until all of the runtime PM * activity has completed. */ pm_runtime_barrier(dev); drv->remove(pci_dev); pm_runtime_put_noidle(dev); } pcibios_free_irq(pci_dev); pci_dev->driver = NULL; pci_iov_remove(pci_dev); /* Undo the runtime PM settings in local_pci_probe() */ pm_runtime_put_sync(dev); /* * If the device is still on, set the power state as "unknown", * since it might change by the next time we load the driver. */ if (pci_dev->current_state == PCI_D0) pci_dev->current_state = PCI_UNKNOWN; /* * We would love to complain here if pci_dev->is_enabled is set, that * the driver should have called pci_disable_device(), but the * unfortunate fact is there are too many odd BIOS and bridge setups * that don't like drivers doing that all of the time. * Oh well, we can dream of sane hardware when we sleep, no matter how * horrible the crap we have to deal with is when we are awake... */ pci_dev_put(pci_dev); } static void pci_device_shutdown(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; pm_runtime_resume(dev); if (drv && drv->shutdown) drv->shutdown(pci_dev); /* * If this is a kexec reboot, turn off Bus Master bit on the * device to tell it to not continue to do DMA. Don't touch * devices in D3cold or unknown states. * If it is not a kexec reboot, firmware will hit the PCI * devices with big hammer and stop their DMA any way. */ if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) pci_clear_master(pci_dev); } #ifdef CONFIG_PM_SLEEP /* Auxiliary functions used for system resume */ /** * pci_restore_standard_config - restore standard config registers of PCI device * @pci_dev: PCI device to handle */ static int pci_restore_standard_config(struct pci_dev *pci_dev) { pci_update_current_state(pci_dev, PCI_UNKNOWN); if (pci_dev->current_state != PCI_D0) { int error = pci_set_power_state(pci_dev, PCI_D0); if (error) return error; } pci_restore_state(pci_dev); pci_pme_restore(pci_dev); return 0; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM /* Auxiliary functions used for system resume and run-time resume */ static void pci_pm_default_resume(struct pci_dev *pci_dev) { pci_fixup_device(pci_fixup_resume, pci_dev); pci_enable_wake(pci_dev, PCI_D0, false); } static void pci_pm_default_resume_early(struct pci_dev *pci_dev) { pci_pm_power_up_and_verify_state(pci_dev); pci_restore_state(pci_dev); pci_pme_restore(pci_dev); } static void pci_pm_bridge_power_up_actions(struct pci_dev *pci_dev) { int ret; ret = pci_bridge_wait_for_secondary_bus(pci_dev, "resume"); if (ret) { /* * The downstream link failed to come up, so mark the * devices below as disconnected to make sure we don't * attempt to resume them. */ pci_walk_bus(pci_dev->subordinate, pci_dev_set_disconnected, NULL); return; } /* * When powering on a bridge from D3cold, the whole hierarchy may be * powered on into D0uninitialized state, resume them to give them a * chance to suspend again */ pci_resume_bus(pci_dev->subordinate); } #endif /* CONFIG_PM */ #ifdef CONFIG_PM_SLEEP /* * Default "suspend" method for devices that have no driver provided suspend, * or not even a driver at all (second part). */ static void pci_pm_set_unknown_state(struct pci_dev *pci_dev) { /* * mark its power state as "unknown", since we don't know if * e.g. the BIOS will change its device state when we suspend. */ if (pci_dev->current_state == PCI_D0) pci_dev->current_state = PCI_UNKNOWN; } /* * Default "resume" method for devices that have no driver provided resume, * or not even a driver at all (second part). */ static int pci_pm_reenable_device(struct pci_dev *pci_dev) { int retval; /* if the device was enabled before suspend, re-enable */ retval = pci_reenable_device(pci_dev); /* * if the device was busmaster before the suspend, make it busmaster * again */ if (pci_dev->is_busmaster) pci_set_master(pci_dev); return retval; } static int pci_legacy_suspend(struct device *dev, pm_message_t state) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; if (drv && drv->suspend) { pci_power_t prev = pci_dev->current_state; int error; error = drv->suspend(pci_dev, state); suspend_report_result(dev, drv->suspend, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: Device state not saved by %pS\n", drv->suspend); } } pci_fixup_device(pci_fixup_suspend, pci_dev); return 0; } static int pci_legacy_suspend_late(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); if (!pci_dev->state_saved) pci_save_state(pci_dev); pci_pm_set_unknown_state(pci_dev); pci_fixup_device(pci_fixup_suspend_late, pci_dev); return 0; } static int pci_legacy_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; pci_fixup_device(pci_fixup_resume, pci_dev); return drv && drv->resume ? drv->resume(pci_dev) : pci_pm_reenable_device(pci_dev); } /* Auxiliary functions used by the new power management framework */ static void pci_pm_default_suspend(struct pci_dev *pci_dev) { /* Disable non-bridge devices without PM support */ if (!pci_has_subordinate(pci_dev)) pci_disable_enabled_device(pci_dev); } static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) { struct pci_driver *drv = pci_dev->driver; bool ret = drv && (drv->suspend || drv->resume); /* * Legacy PM support is used by default, so warn if the new framework is * supported as well. Drivers are supposed to support either the * former, or the latter, but not both at the same time. */ pci_WARN(pci_dev, ret && drv->driver.pm, "device %04x:%04x\n", pci_dev->vendor, pci_dev->device); return ret; } /* New power management framework */ static int pci_pm_prepare(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pm && pm->prepare) { int error = pm->prepare(dev); if (error < 0) return error; if (!error && dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_PREPARE)) return 0; } if (pci_dev_need_resume(pci_dev)) return 0; /* * The PME setting needs to be adjusted here in case the direct-complete * optimization is used with respect to this device. */ pci_dev_adjust_pme(pci_dev); return 1; } static void pci_pm_complete(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); pci_dev_complete_resume(pci_dev); pm_generic_complete(dev); /* Resume device if platform firmware has put it in reset-power-on */ if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) { pci_power_t pre_sleep_state = pci_dev->current_state; pci_refresh_power_state(pci_dev); /* * On platforms with ACPI this check may also trigger for * devices sharing power resources if one of those power * resources has been activated as a result of a change of the * power state of another device sharing it. However, in that * case it is also better to resume the device, in general. */ if (pci_dev->current_state < pre_sleep_state) pm_request_resume(dev); } } #else /* !CONFIG_PM_SLEEP */ #define pci_pm_prepare NULL #define pci_pm_complete NULL #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_SUSPEND static void pcie_pme_root_status_cleanup(struct pci_dev *pci_dev) { /* * Some BIOSes forget to clear Root PME Status bits after system * wakeup, which breaks ACPI-based runtime wakeup on PCI Express. * Clear those bits now just in case (shouldn't hurt). */ if (pci_is_pcie(pci_dev) && (pci_pcie_type(pci_dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(pci_dev) == PCI_EXP_TYPE_RC_EC)) pcie_clear_root_pme_status(pci_dev); } static int pci_pm_suspend(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_dev->skip_bus_pm = false; /* * Disabling PTM allows some systems, e.g., Intel mobile chips * since Coffee Lake, to enter a lower-power PM state. */ pci_suspend_ptm(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_SUSPEND); if (!pm) { pci_pm_default_suspend(pci_dev); return 0; } /* * PCI devices suspended at run time may need to be resumed at this * point, because in general it may be necessary to reconfigure them for * system suspend. Namely, if the device is expected to wake up the * system from the sleep state, it may have to be reconfigured for this * purpose, or if the device is not expected to wake up the system from * the sleep state, it should be prevented from signaling wakeup events * going forward. * * Also if the driver of the device does not indicate that its system * suspend callbacks can cope with runtime-suspended devices, it is * better to resume the device from runtime suspend here. */ if (!dev_pm_smart_suspend(dev) || pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; } else { pci_dev_adjust_pme(pci_dev); } if (pm->suspend) { pci_power_t prev = pci_dev->current_state; int error; error = pm->suspend(dev); suspend_report_result(dev, pm->suspend, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->suspend); } } return 0; } static int pci_pm_suspend_late(struct device *dev) { if (dev_pm_skip_suspend(dev)) return 0; pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); return pm_generic_suspend_late(dev); } static int pci_pm_suspend_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (dev_pm_skip_suspend(dev)) return 0; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (!pm) { pci_save_state(pci_dev); goto Fixup; } if (pm->suspend_noirq) { pci_power_t prev = pci_dev->current_state; int error; error = pm->suspend_noirq(dev); suspend_report_result(dev, pm->suspend_noirq, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->suspend_noirq); goto Fixup; } } if (!pci_dev->state_saved) { pci_save_state(pci_dev); /* * If the device is a bridge with a child in D0 below it, * it needs to stay in D0, so check skip_bus_pm to avoid * putting it into a low-power state in that case. */ if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) pci_prepare_to_sleep(pci_dev); } pci_dbg(pci_dev, "PCI PM: Suspend power state: %s\n", pci_power_name(pci_dev->current_state)); if (pci_dev->current_state == PCI_D0) { pci_dev->skip_bus_pm = true; /* * Per PCI PM r1.2, table 6-1, a bridge must be in D0 if any * downstream device is in D0, so avoid changing the power state * of the parent bridge by setting the skip_bus_pm flag for it. */ if (pci_dev->bus->self) pci_dev->bus->self->skip_bus_pm = true; } if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { pci_dbg(pci_dev, "PCI PM: Skipped\n"); goto Fixup; } pci_pm_set_unknown_state(pci_dev); /* * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's * PCI COMMAND register isn't 0, the BIOS assumes that the controller * hasn't been quiesced and tries to turn it off. If the controller * is already in D3, this can hang or cause memory corruption. * * Since the value of the COMMAND register doesn't matter once the * device has been suspended, we can safely set it to 0 here. */ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) pci_write_config_word(pci_dev, PCI_COMMAND, 0); Fixup: pci_fixup_device(pci_fixup_suspend_late, pci_dev); /* * If the target system sleep state is suspend-to-idle, it is sufficient * to check whether or not the device's wakeup settings are good for * runtime PM. Otherwise, the pm_resume_via_firmware() check will cause * pci_pm_complete() to take care of fixing up the device's state * anyway, if need be. */ if (device_can_wakeup(dev) && !device_may_wakeup(dev)) dev->power.may_skip_resume = false; return 0; } static int pci_pm_resume_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_power_t prev_state = pci_dev->current_state; bool skip_bus_pm = pci_dev->skip_bus_pm; if (dev_pm_skip_resume(dev)) return 0; /* * In the suspend-to-idle case, devices left in D0 during suspend will * stay in D0, so it is not necessary to restore or update their * configuration here and attempting to put them into D0 again is * pointless, so avoid doing that. */ if (!(skip_bus_pm && pm_suspend_no_platform())) pci_pm_default_resume_early(pci_dev); pci_fixup_device(pci_fixup_resume_early, pci_dev); pcie_pme_root_status_cleanup(pci_dev); if (!skip_bus_pm && prev_state == PCI_D3cold) pci_pm_bridge_power_up_actions(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return 0; if (pm && pm->resume_noirq) return pm->resume_noirq(dev); return 0; } static int pci_pm_resume_early(struct device *dev) { if (dev_pm_skip_resume(dev)) return 0; return pm_generic_resume_early(dev); } static int pci_pm_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * This is necessary for the suspend error path in which resume is * called without restoring the standard config registers of the device. */ if (pci_dev->state_saved) pci_restore_standard_config(pci_dev); pci_resume_ptm(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); pci_pm_default_resume(pci_dev); if (pm) { if (pm->resume) return pm->resume(dev); } else { pci_pm_reenable_device(pci_dev); } return 0; } #else /* !CONFIG_SUSPEND */ #define pci_pm_suspend NULL #define pci_pm_suspend_late NULL #define pci_pm_suspend_noirq NULL #define pci_pm_resume NULL #define pci_pm_resume_early NULL #define pci_pm_resume_noirq NULL #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS static int pci_pm_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_FREEZE); if (!pm) { pci_pm_default_suspend(pci_dev); return 0; } /* * Resume all runtime-suspended devices before creating a snapshot * image of system memory, because the restore kernel generally cannot * be expected to always handle them consistently and they need to be * put into the runtime-active metastate during system resume anyway, * so it is better to ensure that the state saved in the image will be * always consistent with that. */ pm_runtime_resume(dev); pci_dev->state_saved = false; if (pm->freeze) { int error; error = pm->freeze(dev); suspend_report_result(dev, pm->freeze, error); if (error) return error; } return 0; } static int pci_pm_freeze_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (pm && pm->freeze_noirq) { int error; error = pm->freeze_noirq(dev); suspend_report_result(dev, pm->freeze_noirq, error); if (error) return error; } if (!pci_dev->state_saved) pci_save_state(pci_dev); pci_pm_set_unknown_state(pci_dev); return 0; } static int pci_pm_thaw_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * The pm->thaw_noirq() callback assumes the device has been * returned to D0 and its config state has been restored. * * In addition, pci_restore_state() restores MSI-X state in MMIO * space, which requires the device to be in D0, so return it to D0 * in case the driver's "freeze" callbacks put it into a low-power * state. */ pci_pm_power_up_and_verify_state(pci_dev); pci_restore_state(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return 0; if (pm && pm->thaw_noirq) return pm->thaw_noirq(dev); return 0; } static int pci_pm_thaw(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); if (pm) { if (pm->thaw) error = pm->thaw(dev); } else { pci_pm_reenable_device(pci_dev); } pci_dev->state_saved = false; return error; } static int pci_pm_poweroff(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_HIBERNATE); if (!pm) { pci_pm_default_suspend(pci_dev); return 0; } /* The reason to do that is the same as in pci_pm_suspend(). */ if (!dev_pm_smart_suspend(dev) || pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; } else { pci_dev_adjust_pme(pci_dev); } if (pm->poweroff) { int error; error = pm->poweroff(dev); suspend_report_result(dev, pm->poweroff, error); if (error) return error; } return 0; } static int pci_pm_poweroff_late(struct device *dev) { if (dev_pm_skip_suspend(dev)) return 0; pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); return pm_generic_poweroff_late(dev); } static int pci_pm_poweroff_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (dev_pm_skip_suspend(dev)) return 0; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (!pm) { pci_fixup_device(pci_fixup_suspend_late, pci_dev); return 0; } if (pm->poweroff_noirq) { int error; error = pm->poweroff_noirq(dev); suspend_report_result(dev, pm->poweroff_noirq, error); if (error) return error; } if (!pci_dev->state_saved && !pci_has_subordinate(pci_dev)) pci_prepare_to_sleep(pci_dev); /* * The reason for doing this here is the same as for the analogous code * in pci_pm_suspend_noirq(). */ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) pci_write_config_word(pci_dev, PCI_COMMAND, 0); pci_fixup_device(pci_fixup_suspend_late, pci_dev); return 0; } static int pci_pm_restore_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_pm_default_resume_early(pci_dev); pci_fixup_device(pci_fixup_resume_early, pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return 0; if (pm && pm->restore_noirq) return pm->restore_noirq(dev); return 0; } static int pci_pm_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * This is necessary for the hibernation error path in which restore is * called without restoring the standard config registers of the device. */ if (pci_dev->state_saved) pci_restore_standard_config(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); pci_pm_default_resume(pci_dev); if (pm) { if (pm->restore) return pm->restore(dev); } else { pci_pm_reenable_device(pci_dev); } return 0; } #else /* !CONFIG_HIBERNATE_CALLBACKS */ #define pci_pm_freeze NULL #define pci_pm_freeze_noirq NULL #define pci_pm_thaw NULL #define pci_pm_thaw_noirq NULL #define pci_pm_poweroff NULL #define pci_pm_poweroff_late NULL #define pci_pm_poweroff_noirq NULL #define pci_pm_restore NULL #define pci_pm_restore_noirq NULL #endif /* !CONFIG_HIBERNATE_CALLBACKS */ #ifdef CONFIG_PM static int pci_pm_runtime_suspend(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_power_t prev = pci_dev->current_state; int error; pci_suspend_ptm(pci_dev); /* * If pci_dev->driver is not set (unbound), we leave the device in D0, * but it may go to D3cold when the bridge above it runtime suspends. * Save its config space in case that happens. */ if (!pci_dev->driver) { pci_save_state(pci_dev); return 0; } pci_dev->state_saved = false; if (pm && pm->runtime_suspend) { error = pm->runtime_suspend(dev); /* * -EBUSY and -EAGAIN is used to request the runtime PM core * to schedule a new suspend, so log the event only with debug * log level. */ if (error == -EBUSY || error == -EAGAIN) { pci_dbg(pci_dev, "can't suspend now (%ps returned %d)\n", pm->runtime_suspend, error); return error; } else if (error) { pci_err(pci_dev, "can't suspend (%ps returned %d)\n", pm->runtime_suspend, error); return error; } } pci_fixup_device(pci_fixup_suspend, pci_dev); if (pm && pm->runtime_suspend && !pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->runtime_suspend); return 0; } if (!pci_dev->state_saved) { pci_save_state(pci_dev); pci_finish_runtime_suspend(pci_dev); } return 0; } static int pci_pm_runtime_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_power_t prev_state = pci_dev->current_state; int error = 0; /* * Restoring config space is necessary even if the device is not bound * to a driver because although we left it in D0, it may have gone to * D3cold when the bridge above it runtime suspended. */ pci_pm_default_resume_early(pci_dev); pci_resume_ptm(pci_dev); if (!pci_dev->driver) return 0; pci_fixup_device(pci_fixup_resume_early, pci_dev); pci_pm_default_resume(pci_dev); if (prev_state == PCI_D3cold) pci_pm_bridge_power_up_actions(pci_dev); if (pm && pm->runtime_resume) error = pm->runtime_resume(dev); return error; } static int pci_pm_runtime_idle(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * If pci_dev->driver is not set (unbound), the device should * always remain in D0 regardless of the runtime PM status */ if (!pci_dev->driver) return 0; if (pm && pm->runtime_idle) return pm->runtime_idle(dev); return 0; } static const struct dev_pm_ops pci_dev_pm_ops = { .prepare = pci_pm_prepare, .complete = pci_pm_complete, .suspend = pci_pm_suspend, .suspend_late = pci_pm_suspend_late, .resume = pci_pm_resume, .resume_early = pci_pm_resume_early, .freeze = pci_pm_freeze, .thaw = pci_pm_thaw, .poweroff = pci_pm_poweroff, .poweroff_late = pci_pm_poweroff_late, .restore = pci_pm_restore, .suspend_noirq = pci_pm_suspend_noirq, .resume_noirq = pci_pm_resume_noirq, .freeze_noirq = pci_pm_freeze_noirq, .thaw_noirq = pci_pm_thaw_noirq, .poweroff_noirq = pci_pm_poweroff_noirq, .restore_noirq = pci_pm_restore_noirq, .runtime_suspend = pci_pm_runtime_suspend, .runtime_resume = pci_pm_runtime_resume, .runtime_idle = pci_pm_runtime_idle, }; #define PCI_PM_OPS_PTR (&pci_dev_pm_ops) #else /* !CONFIG_PM */ #define pci_pm_runtime_suspend NULL #define pci_pm_runtime_resume NULL #define pci_pm_runtime_idle NULL #define PCI_PM_OPS_PTR NULL #endif /* !CONFIG_PM */ /** * __pci_register_driver - register a new pci driver * @drv: the driver structure to register * @owner: owner module of drv * @mod_name: module name string * * Adds the driver structure to the list of registered drivers. * Returns a negative value on error, otherwise 0. * If no error occurred, the driver remains registered even if * no device was claimed during registration. */ int __pci_register_driver(struct pci_driver *drv, struct module *owner, const char *mod_name) { /* initialize common driver fields */ drv->driver.name = drv->name; drv->driver.bus = &pci_bus_type; drv->driver.owner = owner; drv->driver.mod_name = mod_name; drv->driver.groups = drv->groups; drv->driver.dev_groups = drv->dev_groups; spin_lock_init(&drv->dynids.lock); INIT_LIST_HEAD(&drv->dynids.list); /* register with core */ return driver_register(&drv->driver); } EXPORT_SYMBOL(__pci_register_driver); /** * pci_unregister_driver - unregister a pci driver * @drv: the driver structure to unregister * * Deletes the driver structure from the list of registered PCI drivers, * gives it a chance to clean up by calling its remove() function for * each device it was responsible for, and marks those devices as * driverless. */ void pci_unregister_driver(struct pci_driver *drv) { driver_unregister(&drv->driver); pci_free_dynids(drv); } EXPORT_SYMBOL(pci_unregister_driver); static struct pci_driver pci_compat_driver = { .name = "compat" }; /** * pci_dev_driver - get the pci_driver of a device * @dev: the device to query * * Returns the appropriate pci_driver structure or %NULL if there is no * registered driver for the device. */ struct pci_driver *pci_dev_driver(const struct pci_dev *dev) { int i; if (dev->driver) return dev->driver; for (i = 0; i <= PCI_ROM_RESOURCE; i++) if (dev->resource[i].flags & IORESOURCE_BUSY) return &pci_compat_driver; return NULL; } EXPORT_SYMBOL(pci_dev_driver); /** * pci_bus_match - Tell if a PCI device structure has a matching PCI device id structure * @dev: the PCI device structure to match against * @drv: the device driver to search for matching PCI device id structures * * Used by a driver to check whether a PCI device present in the * system is in its list of supported devices. Returns the matching * pci_device_id structure or %NULL if there is no match. */ static int pci_bus_match(struct device *dev, const struct device_driver *drv) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *pci_drv; const struct pci_device_id *found_id; if (pci_dev_binding_disallowed(pci_dev)) return 0; pci_drv = (struct pci_driver *)to_pci_driver(drv); found_id = pci_match_device(pci_drv, pci_dev); if (found_id) return 1; return 0; } /** * pci_dev_get - increments the reference count of the pci device structure * @dev: the device being referenced * * Each live reference to a device should be refcounted. * * Drivers for PCI devices should normally record such references in * their probe() methods, when they bind to a device, and release * them by calling pci_dev_put(), in their disconnect() methods. * * A pointer to the device with the incremented reference counter is returned. */ struct pci_dev *pci_dev_get(struct pci_dev *dev) { if (dev) get_device(&dev->dev); return dev; } EXPORT_SYMBOL(pci_dev_get); /** * pci_dev_put - release a use of the pci device structure * @dev: device that's been disconnected * * Must be called when a user of a device is finished with it. When the last * user of the device calls this function, the memory of the device is freed. */ void pci_dev_put(struct pci_dev *dev) { if (dev) put_device(&dev->dev); } EXPORT_SYMBOL(pci_dev_put); static int pci_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct pci_dev *pdev; if (!dev) return -ENODEV; pdev = to_pci_dev(dev); if (add_uevent_var(env, "PCI_CLASS=%04X", pdev->class)) return -ENOMEM; if (add_uevent_var(env, "PCI_ID=%04X:%04X", pdev->vendor, pdev->device)) return -ENOMEM; if (add_uevent_var(env, "PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor, pdev->subsystem_device)) return -ENOMEM; if (add_uevent_var(env, "PCI_SLOT_NAME=%s", pci_name(pdev))) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X", pdev->vendor, pdev->device, pdev->subsystem_vendor, pdev->subsystem_device, (u8)(pdev->class >> 16), (u8)(pdev->class >> 8), (u8)(pdev->class))) return -ENOMEM; return 0; } #if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) /** * pci_uevent_ers - emit a uevent during recovery path of PCI device * @pdev: PCI device undergoing error recovery * @err_type: type of error event */ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type) { int idx = 0; char *envp[3]; switch (err_type) { case PCI_ERS_RESULT_NONE: case PCI_ERS_RESULT_CAN_RECOVER: envp[idx++] = "ERROR_EVENT=BEGIN_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=0"; break; case PCI_ERS_RESULT_RECOVERED: envp[idx++] = "ERROR_EVENT=SUCCESSFUL_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=1"; break; case PCI_ERS_RESULT_DISCONNECT: envp[idx++] = "ERROR_EVENT=FAILED_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=0"; break; default: break; } if (idx > 0) { envp[idx++] = NULL; kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, envp); } } #endif static int pci_bus_num_vf(struct device *dev) { return pci_num_vf(to_pci_dev(dev)); } /** * pci_dma_configure - Setup DMA configuration * @dev: ptr to dev structure * * Function to update PCI devices's DMA configuration using the same * info from the OF node or ACPI node of host bridge's parent (if any). */ static int pci_dma_configure(struct device *dev) { struct pci_driver *driver = to_pci_driver(dev->driver); struct device *bridge; int ret = 0; bridge = pci_get_host_bridge_device(to_pci_dev(dev)); if (IS_ENABLED(CONFIG_OF) && bridge->parent && bridge->parent->of_node) { ret = of_dma_configure(dev, bridge->parent->of_node, true); } else if (has_acpi_companion(bridge)) { struct acpi_device *adev = to_acpi_device_node(bridge->fwnode); ret = acpi_dma_configure(dev, acpi_get_dma_attr(adev)); } pci_put_host_bridge_device(bridge); /* @driver may not be valid when we're called from the IOMMU layer */ if (!ret && dev->driver && !driver->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); } return ret; } static void pci_dma_cleanup(struct device *dev) { struct pci_driver *driver = to_pci_driver(dev->driver); if (!driver->driver_managed_dma) iommu_device_unuse_default_domain(dev); } /* * pci_device_irq_get_affinity - get IRQ affinity mask for device * @dev: ptr to dev structure * @irq_vec: interrupt vector number * * Return the CPU affinity mask for @dev and @irq_vec. */ static const struct cpumask *pci_device_irq_get_affinity(struct device *dev, unsigned int irq_vec) { return pci_irq_get_affinity(to_pci_dev(dev), irq_vec); } const struct bus_type pci_bus_type = { .name = "pci", .match = pci_bus_match, .uevent = pci_uevent, .probe = pci_device_probe, .remove = pci_device_remove, .shutdown = pci_device_shutdown, .irq_get_affinity = pci_device_irq_get_affinity, .dev_groups = pci_dev_groups, .bus_groups = pci_bus_groups, .drv_groups = pci_drv_groups, .pm = PCI_PM_OPS_PTR, .num_vf = pci_bus_num_vf, .dma_configure = pci_dma_configure, .dma_cleanup = pci_dma_cleanup, }; EXPORT_SYMBOL(pci_bus_type); #ifdef CONFIG_PCIEPORTBUS static int pcie_port_bus_match(struct device *dev, const struct device_driver *drv) { struct pcie_device *pciedev; const struct pcie_port_service_driver *driver; if (drv->bus != &pcie_port_bus_type || dev->bus != &pcie_port_bus_type) return 0; pciedev = to_pcie_device(dev); driver = to_service_driver(drv); if (driver->service != pciedev->service) return 0; if (driver->port_type != PCIE_ANY_PORT && driver->port_type != pci_pcie_type(pciedev->port)) return 0; return 1; } const struct bus_type pcie_port_bus_type = { .name = "pci_express", .match = pcie_port_bus_match, }; #endif static int __init pci_driver_init(void) { int ret; ret = bus_register(&pci_bus_type); if (ret) return ret; #ifdef CONFIG_PCIEPORTBUS ret = bus_register(&pcie_port_bus_type); if (ret) return ret; #endif dma_debug_add_bus(&pci_bus_type); return 0; } postcore_initcall(pci_driver_init);
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 /* mpi-bit.c - MPI bit level functions * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include "mpi-internal.h" #include "longlong.h" #define A_LIMB_1 ((mpi_limb_t) 1) /**************** * Sometimes we have MSL (most significant limbs) which are 0; * this is for some reasons not good, so this function removes them. */ void mpi_normalize(MPI a) { for (; a->nlimbs && !a->d[a->nlimbs - 1]; a->nlimbs--) ; } /**************** * Return the number of bits in A. */ unsigned mpi_get_nbits(MPI a) { unsigned n; mpi_normalize(a); if (a->nlimbs) { mpi_limb_t alimb = a->d[a->nlimbs - 1]; if (alimb) n = count_leading_zeros(alimb); else n = BITS_PER_MPI_LIMB; n = BITS_PER_MPI_LIMB - n + (a->nlimbs - 1) * BITS_PER_MPI_LIMB; } else n = 0; return n; } EXPORT_SYMBOL_GPL(mpi_get_nbits); /**************** * Test whether bit N is set. */ int mpi_test_bit(MPI a, unsigned int n) { unsigned int limbno, bitno; mpi_limb_t limb; limbno = n / BITS_PER_MPI_LIMB; bitno = n % BITS_PER_MPI_LIMB; if (limbno >= a->nlimbs) return 0; /* too far left: this is a 0 */ limb = a->d[limbno]; return (limb & (A_LIMB_1 << bitno)) ? 1 : 0; } EXPORT_SYMBOL_GPL(mpi_test_bit); /**************** * Set bit N of A. */ int mpi_set_bit(MPI a, unsigned int n) { unsigned int i, limbno, bitno; int err; limbno = n / BITS_PER_MPI_LIMB; bitno = n % BITS_PER_MPI_LIMB; if (limbno >= a->nlimbs) { for (i = a->nlimbs; i < a->alloced; i++) a->d[i] = 0; err = mpi_resize(a, limbno+1); if (err) return err; a->nlimbs = limbno+1; } a->d[limbno] |= (A_LIMB_1<<bitno); return 0; } EXPORT_SYMBOL_GPL(mpi_set_bit); /* * Shift A by N bits to the right. */ int mpi_rshift(MPI x, MPI a, unsigned int n) { mpi_size_t xsize; unsigned int i; unsigned int nlimbs = (n/BITS_PER_MPI_LIMB); unsigned int nbits = (n%BITS_PER_MPI_LIMB); int err; if (x == a) { /* In-place operation. */ if (nlimbs >= x->nlimbs) { x->nlimbs = 0; return 0; } if (nlimbs) { for (i = 0; i < x->nlimbs - nlimbs; i++) x->d[i] = x->d[i+nlimbs]; x->d[i] = 0; x->nlimbs -= nlimbs; } if (x->nlimbs && nbits) mpihelp_rshift(x->d, x->d, x->nlimbs, nbits); } else if (nlimbs) { /* Copy and shift by more or equal bits than in a limb. */ xsize = a->nlimbs; x->sign = a->sign; err = RESIZE_IF_NEEDED(x, xsize); if (err) return err; x->nlimbs = xsize; for (i = 0; i < a->nlimbs; i++) x->d[i] = a->d[i]; x->nlimbs = i; if (nlimbs >= x->nlimbs) { x->nlimbs = 0; return 0; } for (i = 0; i < x->nlimbs - nlimbs; i++) x->d[i] = x->d[i+nlimbs]; x->d[i] = 0; x->nlimbs -= nlimbs; if (x->nlimbs && nbits) mpihelp_rshift(x->d, x->d, x->nlimbs, nbits); } else { /* Copy and shift by less than bits in a limb. */ xsize = a->nlimbs; x->sign = a->sign; err = RESIZE_IF_NEEDED(x, xsize); if (err) return err; x->nlimbs = xsize; if (xsize) { if (nbits) mpihelp_rshift(x->d, a->d, x->nlimbs, nbits); else { /* The rshift helper function is not specified for * NBITS==0, thus we do a plain copy here. */ for (i = 0; i < x->nlimbs; i++) x->d[i] = a->d[i]; } } } MPN_NORMALIZE(x->d, x->nlimbs); return 0; } EXPORT_SYMBOL_GPL(mpi_rshift);
27 5 1 27 29 29 2 9 4 26 72 73 58 15 72 71 60 72 10 40 15 2 41 36 10 10 69 36 36 1 26 10 13 27 21 6 27 13 13 13 10 3 13 13 13 13 1 11 12 3 10 10 10 10 13 13 15 15 4 10 1 11 15 15 9 7 5 10 8 10 10 15 12 8 4 12 3 3 3 3 3 3 3 3 3 20 1 14 12 1 4 1 3 3 2 1 14 13 12 3 2 14 14 1 13 13 9 5 3 5 3 15 15 15 1 1 3 2 1 10 10 58 13 53 57 15 59 48 6 4 2 1 37 36 16 11 10 37 16 20 13 21 18 18 21 37 6 23 23 23 11 15 15 15 15 4 4 4 3 12 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ /* * jfs_dtree.c: directory B+-tree manager * * B+-tree with variable length key directory: * * each directory page is structured as an array of 32-byte * directory entry slots initialized as a freelist * to avoid search/compaction of free space at insertion. * when an entry is inserted, a number of slots are allocated * from the freelist as required to store variable length data * of the entry; when the entry is deleted, slots of the entry * are returned to freelist. * * leaf entry stores full name as key and file serial number * (aka inode number) as data. * internal/router entry stores sufffix compressed name * as key and simple extent descriptor as data. * * each directory page maintains a sorted entry index table * which stores the start slot index of sorted entries * to allow binary search on the table. * * directory starts as a root/leaf page in on-disk inode * inline data area. * when it becomes full, it starts a leaf of a external extent * of length of 1 block. each time the first leaf becomes full, * it is extended rather than split (its size is doubled), * until its length becoms 4 KBytes, from then the extent is split * with new 4 Kbyte extent when it becomes full * to reduce external fragmentation of small directories. * * blah, blah, blah, for linear scan of directory in pieces by * readdir(). * * * case-insensitive directory file system * * names are stored in case-sensitive way in leaf entry. * but stored, searched and compared in case-insensitive (uppercase) order * (i.e., both search key and entry key are folded for search/compare): * (note that case-sensitive order is BROKEN in storage, e.g., * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad * * entries which folds to the same key makes up a equivalent class * whose members are stored as contiguous cluster (may cross page boundary) * but whose order is arbitrary and acts as duplicate, e.g., * abc, Abc, aBc, abC) * * once match is found at leaf, requires scan forward/backward * either for, in case-insensitive search, duplicate * or for, in case-sensitive search, for exact match * * router entry must be created/stored in case-insensitive way * in internal entry: * (right most key of left page and left most key of right page * are folded, and its suffix compression is propagated as router * key in parent) * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB> * should be made the router key for the split) * * case-insensitive search: * * fold search key; * * case-insensitive search of B-tree: * for internal entry, router key is already folded; * for leaf entry, fold the entry key before comparison. * * if (leaf entry case-insensitive match found) * if (next entry satisfies case-insensitive match) * return EDUPLICATE; * if (prev entry satisfies case-insensitive match) * return EDUPLICATE; * return match; * else * return no match; * * serialization: * target directory inode lock is being held on entry/exit * of all main directory service routines. * * log based recovery: */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dmap.h" #include "jfs_unicode.h" #include "jfs_debug.h" /* dtree split parameter */ struct dtsplit { struct metapage *mp; s16 index; s16 nslot; struct component_name *key; ddata_t *data; struct pxdlist *pxdlist; }; #define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) /* get page buffer for specified block address */ #define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ do { \ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \ ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ MP = NULL; \ RC = -EIO; \ } \ } \ } while (0) /* for consistency */ #define DT_PUTPAGE(MP) BT_PUTPAGE(MP) #define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) /* * forward references */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp); static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack); static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); static int dtReadFirst(struct inode *ip, struct btstack * btstack); static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack); static int dtCompare(struct component_name * key, dtpage_t * p, int si); static int ciCompare(struct component_name * key, dtpage_t * p, int si, int flag); static void dtGetKey(dtpage_t * p, int i, struct component_name * key, int flag); static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag); static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock **); static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index); static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); #define ciToUpper(c) UniStrupr((c)->name) /* * read_index_page() * * Reads a page of a directory's index table. * Having metadata mapped into the directory inode's address space * presents a multitude of problems. We avoid this by mapping to * the absolute address space outside of the *_metapage routines */ static struct metapage *read_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return read_metapage(inode, xaddr, PSIZE, 1); } /* * get_index_page() * * Same as get_index_page(), but get's a new page without reading */ static struct metapage *get_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return get_metapage(inode, xaddr, PSIZE, 1); } /* * find_index() * * Returns dtree page containing directory table entry for specified * index and pointer to its entry. * * mp must be released by caller. */ static struct dir_table_slot *find_index(struct inode *ip, u32 index, struct metapage ** mp, s64 *lblock) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); s64 blkno; s64 offset; int page_offset; struct dir_table_slot *slot; static int maxWarnings = 10; if (index < 2) { if (maxWarnings) { jfs_warn("find_entry called with index = %d", index); maxWarnings--; } return NULL; } if (index >= jfs_ip->next_index) { jfs_warn("find_entry called with index >= next_index"); return NULL; } if (jfs_dirtable_inline(ip)) { /* * Inline directory table */ *mp = NULL; slot = &jfs_ip->i_dirtable[index - 2]; } else { offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << JFS_SBI(ip->i_sb)->l2nbperpage; if (*mp && (*lblock != blkno)) { release_metapage(*mp); *mp = NULL; } if (!(*mp)) { *lblock = blkno; *mp = read_index_page(ip, blkno); } if (!(*mp)) { jfs_err("free_index: error reading directory table"); return NULL; } slot = (struct dir_table_slot *) ((char *) (*mp)->data + page_offset); } return slot; } static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, u32 index) { struct tlock *tlck; struct linelock *llck; struct lv *lv; tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) tlck->lock; if (llck->index >= llck->maxcnt) llck = txLinelock(llck); lv = &llck->lv[llck->index]; /* * Linelock slot size is twice the size of directory table * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; llck->index++; } /* * add_index() * * Adds an entry to the directory index table. This is used to provide * each directory entry with a persistent index in which to resume * directory traversals */ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) { struct super_block *sb = ip->i_sb; struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); u64 blkno; struct dir_table_slot *dirtab_slot; u32 index; struct linelock *llck; struct lv *lv; struct metapage *mp; s64 offset; uint page_offset; struct tlock *tlck; s64 xaddr; ASSERT(DO_INDEX(ip)); if (jfs_ip->next_index < 2) { jfs_warn("add_index: next_index = %d. Resetting!", jfs_ip->next_index); jfs_ip->next_index = 2; } index = jfs_ip->next_index++; if (index <= MAX_INLINE_DIRTABLE_ENTRY) { /* * i_size reflects size of index table, or 8 bytes per entry. */ ip->i_size = (loff_t) (index - 1) << 3; /* * dir table fits inline within inode */ dirtab_slot = &jfs_ip->i_dirtable[index-2]; dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); set_cflag(COMMIT_Dirtable, ip); return index; } if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { struct dir_table_slot temp_table[12]; /* * It's time to move the inline table to an external * page and begin to build the xtree */ if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { dquot_free_block(ip, sbi->nbperpage); goto clean_up; } /* * Save the table, we're going to overwrite it with the * xtree root */ memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); /* * Initialize empty x-tree */ xtInitRoot(tid, ip); /* * Add the first block to the xtree */ if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { /* This really shouldn't fail */ jfs_warn("add_index: xtInsert failed!"); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; mp = get_index_page(ip, 0); if (!mp) { jfs_err("add_index: get_metapage failed!"); xtTruncate(tid, ip, 0, COMMIT_PWMAP); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); goto clean_up; } tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) & tlck->lock; ASSERT(llck->index == 0); lv = &llck->lv[0]; lv->offset = 0; lv->length = 6; /* tlckDATA slot size is 16 bytes */ llck->index++; memcpy(mp->data, temp_table, sizeof(temp_table)); mark_metapage_dirty(mp); release_metapage(mp); /* * Logging is now directed by xtree tlocks */ clear_cflag(COMMIT_Dirtable, ip); } offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; if (page_offset == 0) { /* * This will be the beginning of a new page */ xaddr = 0; if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { jfs_warn("add_index: xtInsert failed!"); goto clean_up; } ip->i_size += PSIZE; if ((mp = get_index_page(ip, blkno))) memset(mp->data, 0, PSIZE); /* Just looks better */ else xtTruncate(tid, ip, offset, COMMIT_PWMAP); } else mp = read_index_page(ip, blkno); if (!mp) { jfs_err("add_index: get/read_metapage failed!"); goto clean_up; } lock_index(tid, ip, mp, index); dirtab_slot = (struct dir_table_slot *) ((char *) mp->data + page_offset); dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); mark_metapage_dirty(mp); release_metapage(mp); return index; clean_up: jfs_ip->next_index--; return 0; } /* * free_index() * * Marks an entry to the directory index table as free. */ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) { struct dir_table_slot *dirtab_slot; s64 lblock; struct metapage *mp = NULL; dirtab_slot = find_index(ip, index, &mp, &lblock); if (!dirtab_slot) return; dirtab_slot->flag = DIR_INDEX_FREE; dirtab_slot->slot = dirtab_slot->addr1 = 0; dirtab_slot->addr2 = cpu_to_le32(next); if (mp) { lock_index(tid, ip, mp, index); mark_metapage_dirty(mp); release_metapage(mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * modify_index() * * Changes an entry in the directory index table */ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, int slot, struct metapage ** mp, s64 *lblock) { struct dir_table_slot *dirtab_slot; dirtab_slot = find_index(ip, index, mp, lblock); if (!dirtab_slot) return; DTSaddress(dirtab_slot, bn); dirtab_slot->slot = slot; if (*mp) { lock_index(tid, ip, *mp, index); mark_metapage_dirty(*mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * read_index() * * reads a directory table slot */ static int read_index(struct inode *ip, u32 index, struct dir_table_slot * dirtab_slot) { s64 lblock; struct metapage *mp = NULL; struct dir_table_slot *slot; slot = find_index(ip, index, &mp, &lblock); if (!slot) { return -EIO; } memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); if (mp) release_metapage(mp); return 0; } /* * dtSearch() * * function: * Search for the entry with specified key * * parameter: * * return: 0 - search result on stack, leaf page pinned; * errno - I/O error */ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, struct btstack * btstack, int flag) { int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; struct metapage *mp; dtpage_t *p; s8 *stbl; int base, index, lim; struct btframe *btsp; pxd_t *pxd; int psize = 288; /* initial in-line directory */ ino_t inumber; struct component_name ciKey; struct super_block *sb = ip->i_sb; ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_NOFS); if (!ciKey.name) { rc = -ENOMEM; goto dtSearch_Exit2; } /* uppercase search key for c-i directory */ UniStrcpy(ciKey.name, key->name); ciKey.namlen = key->namlen; /* only uppercase if case-insensitive support is on */ if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { ciToUpper(&ciKey); } BT_CLR(btstack); /* reset stack */ /* init level count for max pages to split */ btstack->nsplit = 1; /* * search down tree from root: * * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of * internal page, child page Pi contains entry with k, Ki <= K < Kj. * * if entry with search key K is not found * internal page search find the entry with largest key Ki * less than K which point to the child page to search; * leaf page search find the entry with smallest key Kj * greater than K so that the returned index is the position of * the entry to be shifted right for insertion of new entry. * for empty tree, search key is greater than any key of the tree. * * by convention, root bn = 0. */ for (bn = 0;;) { /* get/pin the page to search */ DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) goto dtSearch_Exit1; /* get sorted entry table of the page */ stbl = DT_GETSTBL(p); /* * binary search with search key K on the current page. */ for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { index = base + (lim >> 1); if (stbl[index] < 0) { rc = -EIO; goto out; } if (p->header.flag & BT_LEAF) { /* uppercase leaf name to compare */ cmp = ciCompare(&ciKey, p, stbl[index], JFS_SBI(sb)->mntflag); } else { /* router key is in uppercase */ cmp = dtCompare(&ciKey, p, stbl[index]); } if (cmp == 0) { /* * search hit */ /* search hit - leaf page: * return the entry found */ if (p->header.flag & BT_LEAF) { inumber = le32_to_cpu( ((struct ldtentry *) & p->slot[stbl[index]])->inumber); /* * search for JFS_LOOKUP */ if (flag == JFS_LOOKUP) { *data = inumber; rc = 0; goto out; } /* * search for JFS_CREATE */ if (flag == JFS_CREATE) { *data = inumber; rc = -EEXIST; goto out; } /* * search for JFS_REMOVE or JFS_RENAME */ if ((flag == JFS_REMOVE || flag == JFS_RENAME) && *data != inumber) { rc = -ESTALE; goto out; } /* * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME */ /* save search result */ *data = inumber; btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* search hit - internal page: * descend/search its child page */ goto getChild; } if (cmp > 0) { base = index + 1; --lim; } } /* * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. */ /* * search miss - leaf page * * return location of entry (base) where new entry with * search key K is to be inserted. */ if (p->header.flag & BT_LEAF) { /* * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME */ if (flag == JFS_LOOKUP || flag == JFS_REMOVE || flag == JFS_RENAME) { rc = -ENOENT; goto out; } /* * search for JFS_CREATE|JFS_FINDDIR: * * save search result */ *data = 0; btsp = btstack->top; btsp->bn = bn; btsp->index = base; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* * search miss - internal page * * if base is non-zero, decrement base by one to get the parent * entry of the child page to search. */ index = base ? base - 1 : base; /* * go down to child page */ getChild: /* update max. number of pages to split */ if (BT_STACK_FULL(btstack)) { /* Something's corrupted, mark filesystem dirty so * chkdsk will fix it. */ jfs_error(sb, "stack overrun!\n"); BT_STACK_DUMP(btstack); rc = -EIO; goto out; } btstack->nsplit++; /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, index); /* get the child page block number */ pxd = (pxd_t *) & p->slot[stbl[index]]; bn = addressPXD(pxd); psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } out: DT_PUTPAGE(mp); dtSearch_Exit1: kfree(ciKey.name); dtSearch_Exit2: return rc; } /* * dtInsert() * * function: insert an entry to directory tree * * parameter: * * return: 0 - success; * errno - failure; */ int dtInsert(tid_t tid, struct inode *ip, struct component_name * name, ino_t * fsn, struct btstack * btstack) { int rc = 0; struct metapage *mp; /* meta-page buffer */ dtpage_t *p; /* base B+-tree index page */ s64 bn; int index; struct dtsplit split; /* split information */ ddata_t data; struct dt_lock *dtlck; int n; struct tlock *tlck; struct lv *lv; /* * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of * the full page. */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); if (p->header.freelist == 0) return -EINVAL; /* * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { DT_PUTPAGE(mp); return -EMLINK; } n = NDTLEAF(name->namlen); data.leaf.tid = tid; data.leaf.ip = ip; } else { n = NDTLEAF_LEGACY(name->namlen); data.leaf.ip = NULL; /* signifies legacy directory format */ } data.leaf.ino = *fsn; /* * leaf page does not have enough room for new entry: * * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ if (n > p->header.freecnt) { split.mp = mp; split.index = index; split.nslot = n; split.key = name; split.data = &data; rc = dtSplitUp(tid, ip, &split, btstack); return rc; } /* * leaf page does have enough room for new entry: * * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; dtInsertEntry(p, index, name, &data, &dtlck); /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; n = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + n; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; } /* * dtSplitUp() * * function: propagate insertion bottom up; * * parameter: * * return: 0 - success; * errno - failure; * leaf page unpinned; */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); int rc = 0; struct metapage *smp; dtpage_t *sp; /* split page */ struct metapage *rmp; dtpage_t *rp; /* new right page split from sp */ pxd_t rpxd; /* new right page extent descriptor */ struct metapage *lmp; dtpage_t *lp; /* left child page */ int skip; /* index of entry of insertion */ struct btframe *parent; /* parent page entry on traverse stack */ s64 xaddr, nxaddr; int xlen, xsize; struct pxdlist pxdlist; pxd_t *pxd; struct component_name key = { 0, NULL }; ddata_t *data = split->data; int n; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int quota_allocation = 0; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS); if (!key.name) { DT_PUTPAGE(smp); rc = -ENOMEM; goto dtSplitUp_Exit; } /* * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. */ /* * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* * allocate a single extent child page */ xlen = 1; n = sbi->bsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ if (n <= split->nslot) xlen++; if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { DT_PUTPAGE(smp); goto freeKeyName; } pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); split->pxdlist = &pxdlist; rc = dtSplitRoot(tid, ip, split, &rmp); if (rc) dbFree(ip, xaddr, xlen); else DT_PUTPAGE(rmp); DT_PUTPAGE(smp); if (!DO_INDEX(ip)) ip->i_size = xlen << sbi->l2bsize; goto freeKeyName; } /* * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() reurns leaf page unpinned) */ pxd = &sp->header.self; xlen = lengthPXD(pxd); xsize = xlen << sbi->l2bsize; if (xsize < PSIZE) { xaddr = addressPXD(pxd); n = xsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ if ((n + sp->header.freecnt) <= split->nslot) n = xlen + (xlen << 1); else n = xlen; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, n); if (rc) goto extendOut; quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, (s64) n, &nxaddr))) goto extendOut; pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, nxaddr); PXDlength(pxd, xlen + n); split->pxdlist = &pxdlist; if ((rc = dtExtendPage(tid, ip, split, btstack))) { nxaddr = addressPXD(pxd); if (xaddr != nxaddr) { /* free relocated extent */ xlen = lengthPXD(pxd); dbFree(ip, nxaddr, (s64) xlen); } else { /* free extended delta */ xlen = lengthPXD(pxd) - n; xaddr = addressPXD(pxd) + xlen; dbFree(ip, xaddr, (s64) n); } } else if (!DO_INDEX(ip)) ip->i_size = lengthPXD(pxd) << sbi->l2bsize; extendOut: DT_PUTPAGE(smp); goto freeKeyName; } /* * split leaf page <sp> into <sp> and a new right page <rp>. * * return <rp> pinned and its extent descriptor <rpxd> */ /* * allocate new directory page extent and * new index page(s) to cover page split(s) * * allocation hint: ? */ n = btstack->nsplit; pxdlist.maxnpxd = pxdlist.npxd = 0; xlen = sbi->nbperpage; for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); pxdlist.maxnpxd++; continue; } DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } split->pxdlist = &pxdlist; if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } if (!DO_INDEX(ip)) ip->i_size += PSIZE; /* * propagate up the router entry for the leaf page just split * * insert a router entry for the new page into the parent page, * propagate the insert/split up the tree by walking back the stack * of (bn of parent page, index of child page entry in parent page) * that were traversed during the search for the page that split. * * the propagation of insert/split up the tree stops if the root * splits or the page inserted into doesn't have to split to hold * the new entry. * * the parent entry for the split page remains the same, and * a new entry is inserted at its right with the first key and * block number of the new right page. * * There are a maximum of 4 pages pinned at any time: * two children, left parent and right parent (when the parent splits). * keep the child pages pinned while working on the parent. * make sure that all pins are released at exit. */ while ((parent = BT_POP(btstack)) != NULL) { /* parent page specified by stack frame <parent> */ /* keep current child pages (<lp>, <rp>) pinned */ lmp = smp; lp = sp; /* * insert router entry in parent for new right child page <rp> */ /* get the parent page <sp> */ DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); goto splitOut; } /* * The new key entry goes ONE AFTER the index of parent entry, * because the split was to the right. */ skip = parent->index + 1; /* * compute the key for the router entry * * key suffix compression: * for internal pages that have leaf pages as children, * retain only what's needed to distinguish between * the new entry and the entry on the page to its left. * If the keys compare equal, retain the entire key. * * note that compression is performed only at computing * router key at the lowest internal level. * further compression of the key between pairs of higher * level internal pages loses too much information and * the search may fail. * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} * results in two adjacent parent entries (a)(xx). * if split occurs between these two entries, and * if compression is applied, the router key of parent entry * of right page (x) will divert search for x into right * subtree and miss x in the left subtree.) * * the entire key must be retained for the next-to-leftmost * internal key at any level of the tree, or search may fail * (e.g., ?) */ switch (rp->header.flag & BT_TYPE) { case BT_LEAF: /* * compute the length of prefix for suffix compression * between last entry of left page and first entry * of right page */ if ((sp->header.flag & BT_ROOT && skip > 1) || sp->header.prev != 0 || skip > 1) { /* compute uppercase router prefix key */ rc = ciGetLeafPrefixKey(lp, lp->header.nextindex-1, rp, 0, &key, sbi->mntflag); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); DT_PUTPAGE(smp); goto splitOut; } } else { /* next to leftmost entry of lowest internal level */ /* compute uppercase router key */ dtGetKey(rp, 0, &key, sbi->mntflag); key.name[key.namlen] = 0; if ((sbi->mntflag & JFS_OS2) == JFS_OS2) ciToUpper(&key); } n = NDTINTERNAL(key.namlen); break; case BT_INTERNAL: dtGetKey(rp, 0, &key, sbi->mntflag); n = NDTINTERNAL(key.namlen); break; default: jfs_err("dtSplitUp(): UFO!"); break; } /* unpin left child page */ DT_PUTPAGE(lmp); /* * compute the data for the router entry */ data->xd = rpxd; /* child page xd */ /* * parent page is full - split the parent page */ if (n > sp->header.freecnt) { /* init for parent page split */ split->mp = smp; split->index = skip; /* index at insert */ split->nslot = n; split->key = &key; /* split->data = data; */ /* unpin right child page */ DT_PUTPAGE(rmp); /* The split routines insert the new entry, * acquire txLock as appropriate. * return <rp> pinned and its block number <rbn>. */ rc = (sp->header.flag & BT_ROOT) ? dtSplitRoot(tid, ip, split, &rmp) : dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); if (rc) { DT_PUTPAGE(smp); goto splitOut; } /* smp and rmp are pinned */ } /* * parent page is not full - insert router entry in parent page */ else { BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the parent page */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root parent page */ if (!(sp->header.flag & BT_ROOT)) { lv++; n = skip >> L2DTSLOTSIZE; lv->offset = sp->header.stblindex + n; lv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } dtInsertEntry(sp, skip, &key, data, &dtlck); /* exit propagate up */ break; } } /* unpin current split and its right page */ DT_PUTPAGE(smp); DT_PUTPAGE(rmp); /* * free remaining extents allocated for split */ splitOut: n = pxdlist.npxd; pxd = &pxdlist.pxd[n]; for (; n < pxdlist.maxnpxd; n++, pxd++) dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); freeKeyName: kfree(key.name); /* Rollback quota allocation */ if (rc && quota_allocation) dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: return rc; } /* * dtSplitPage() * * function: Split a non-root page of a btree. * * parameter: * * return: 0 - success; * errno - failure; * return split and new page pinned; */ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) { int rc = 0; struct metapage *smp; dtpage_t *sp; struct metapage *rmp; dtpage_t *rp; /* new right page allocated */ s64 rbn; /* new right page block number */ struct metapage *mp; dtpage_t *p; s64 nextbn; struct pxdlist *pxdlist; pxd_t *pxd; int skip, nextindex, half, left, nxt, off, si; struct ldtentry *ldtentry; struct idtentry *idtentry; u8 *stbl; struct dtslot *f; int fsi, stblsize; int n; struct dt_lock *sdtlck, *rdtlck; struct tlock *tlck; struct dt_lock *dtlck; struct lv *slv, *rlv, *lv; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); /* * allocate the new right page for the split */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); rmp = get_metapage(ip, rbn, PSIZE, 1); if (rmp == NULL) return -EIO; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); rdtlck = (struct dt_lock *) & tlck->lock; rp = (dtpage_t *) rmp->data; *rpp = rp; rp->header.self = *pxd; BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the split page * * action: */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); sdtlck = (struct dt_lock *) & tlck->lock; /* linelock header of split page */ ASSERT(sdtlck->index == 0); slv = & sdtlck->lv[0]; slv->offset = 0; slv->length = 1; sdtlck->index++; /* * initialize/update sibling pointers between sp and rp */ nextbn = le64_to_cpu(sp->header.next); rp->header.next = cpu_to_le64(nextbn); rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); sp->header.next = cpu_to_le64(rbn); /* * initialize new right page */ rp->header.flag = sp->header.flag; /* compute sorted entry table at start of extent data area */ rp->header.nextindex = 0; rp->header.stblindex = 1; n = PSIZE >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ /* init freelist */ fsi = rp->header.stblindex + stblsize; rp->header.freelist = fsi; rp->header.freecnt = rp->header.maxslot - fsi; /* * sequential append at tail: append without split * * If splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is * sequential. Adding an empty page on the side of the level is less * work and can push the fill factor much higher than normal. * If we're wrong it's no big deal, we'll just do the split the right * way next time. * (It may look like it's equally easy to do a similar hack for * reverse sorted data, that is, split the tree left, * but it's not. Be my guest.) */ if (nextbn == 0 && split->index == sp->header.nextindex) { /* linelock header + stbl (first slot) of new page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 2; rdtlck->index++; /* * initialize freelist of new right page */ f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* insert entry at the first entry of the new right page */ dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); goto out; } /* * non-sequential insert (at possibly middle page) */ /* * update prev pointer of previous right sibling page; */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) { discard_metapage(rmp); return rc; } BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header of previous right sibling page */ lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(rbn); DT_PUTPAGE(mp); } /* * split the data between the split and right pages. */ skip = split->index; half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ left = 0; /* * compute fill factor for split pages * * <nxt> traces the next entry to move to rp * <off> traces the next entry to stay in sp */ stbl = (u8 *) & sp->slot[sp->header.stblindex]; nextindex = sp->header.nextindex; for (nxt = off = 0; nxt < nextindex; ++off) { if (off == skip) /* check for fill factor with new entry size */ n = split->nslot; else { si = stbl[nxt]; switch (sp->header.flag & BT_TYPE) { case BT_LEAF: ldtentry = (struct ldtentry *) & sp->slot[si]; if (DO_INDEX(ip)) n = NDTLEAF(ldtentry->namlen); else n = NDTLEAF_LEGACY(ldtentry-> namlen); break; case BT_INTERNAL: idtentry = (struct idtentry *) & sp->slot[si]; n = NDTINTERNAL(idtentry->namlen); break; default: break; } ++nxt; /* advance to next entry to move in sp */ } left += n; if (left >= half) break; } /* <nxt> poins to the 1st entry to move */ /* * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * * split page moved out entries are linelocked; * new/right page moved in entries are linelocked; */ /* linelock header + stbl of new right page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 5; rdtlck->index++; dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); sp->header.nextindex = nxt; /* * finalize freelist of new right page */ fsi = rp->header.freelist; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * the skipped index was on the left page, */ if (skip <= off) { /* insert the new entry in the split page */ dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); /* linelock stbl of split page */ if (sdtlck->index >= sdtlck->maxcnt) sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[sdtlck->index]; n = skip >> L2DTSLOTSIZE; slv->offset = sp->header.stblindex + n; slv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; sdtlck->index++; } /* * the skipped index was on the right page, */ else { /* adjust the skip index to reflect the new position */ skip -= nxt; /* insert the new entry in the right page */ dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); } out: *rmpp = rmp; *rpxdp = *pxd; return rc; } /* * dtExtendPage() * * function: extend 1st/only directory leaf page * * parameter: * * return: 0 - success; * errno - failure; * return extended page pinned; */ static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct super_block *sb = ip->i_sb; int rc; struct metapage *smp, *pmp, *mp; dtpage_t *sp, *pp; struct pxdlist *pxdlist; pxd_t *pxd, *tpxd; int xlen, xsize; int newstblindex, newstblsize; int oldstblindex, oldstblsize; int fsi, last; struct dtslot *f; struct btframe *parent; int n; struct dt_lock *dtlck; s64 xaddr, txaddr; struct tlock *tlck; struct pxd_lock *pxdlock; struct lv *lv; uint type; struct ldtentry *ldtentry; u8 *stbl; /* get page to extend */ smp = split->mp; sp = DT_PAGE(ip, smp); /* get parent/root page */ parent = BT_POP(btstack); DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); if (rc) return (rc); /* * extend the extent */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; xaddr = addressPXD(pxd); tpxd = &sp->header.self; txaddr = addressPXD(tpxd); /* in-place extension */ if (xaddr == txaddr) { type = tlckEXTEND; } /* relocation */ else { type = tlckNEW; /* save moved extent descriptor for later free */ tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = sp->header.self; pxdlock->index = 1; /* * Update directory index table to reflect new page address */ if (DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(sp); for (n = 0; n < sp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & sp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), xaddr, n, &mp, &lblock); } if (mp) release_metapage(mp); } } /* * extend the page */ sp->header.self = *pxd; jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the extended/leaf page */ tlck = txLock(tid, ip, smp, tlckDTREE | type); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[0]; /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; /* * copy old stbl to new stbl at start of extended area */ oldstblindex = sp->header.stblindex; oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; newstblindex = sp->header.maxslot; n = xsize >> L2DTSLOTSIZE; newstblsize = (n + 31) >> L2DTSLOTSIZE; memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], sp->header.nextindex); /* * in-line extension: linelock old area of extended page */ if (type == tlckEXTEND) { /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; lv++; /* linelock new stbl of extended page */ lv->offset = newstblindex; lv->length = newstblsize; } /* * relocation: linelock whole relocated area */ else { lv->offset = 0; lv->length = sp->header.maxslot + newstblsize; } dtlck->index++; sp->header.maxslot = n; sp->header.stblindex = newstblindex; /* sp->header.nextindex remains the same */ /* * add old stbl region at head of freelist */ fsi = oldstblindex; f = &sp->slot[fsi]; last = sp->header.freelist; for (n = 0; n < oldstblsize; n++, fsi++, f++) { f->next = last; last = fsi; } sp->header.freelist = last; sp->header.freecnt += oldstblsize; /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = newstblindex + newstblsize; f = &sp->slot[fsi]; for (fsi++; fsi < sp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) sp->header.freelist = n; else { do { f = &sp->slot[fsi]; fsi = f->next; } while (fsi != -1); f->next = n; } sp->header.freecnt += sp->header.maxslot - n; /* * insert the new entry */ dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); BT_MARK_DIRTY(pmp, ip); /* * linelock any freeslots residing in old extent */ if (type == tlckEXTEND) { n = sp->header.maxslot >> 2; if (sp->header.freelist < n) dtLinelockFreelist(sp, n, &dtlck); } /* * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page */ tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[dtlck->index]; /* linelock parent entry - 1st slot */ lv->offset = 1; lv->length = 1; dtlck->index++; /* update the parent pxd for page extension */ tpxd = (pxd_t *) & pp->slot[1]; *tpxd = *pxd; DT_PUTPAGE(pmp); return 0; } /* * dtSplitRoot() * * function: * split the full root page into * original/root/split page and new right page * i.e., root remains fixed in tree anchor (inode) and * the root is copied to a single new right child page * since root page << non-root page, and * the split root page contains a single entry for the * new right child page. * * parameter: * * return: 0 - success; * errno - failure; * return new page pinned; */ static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) { struct super_block *sb = ip->i_sb; struct metapage *smp; dtroot_t *sp; struct metapage *rmp; dtpage_t *rp; s64 rbn; int xlen; int xsize; struct dtslot *f; s8 *stbl; int fsi, stblsize, n; struct idtentry *s; pxd_t *ppxd; struct pxdlist *pxdlist; pxd_t *pxd; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int rc; /* get split root page */ smp = split->mp; sp = &JFS_IP(ip)->i_dtroot; /* * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; rmp = get_metapage(ip, rbn, xsize, 1); if (!rmp) return -EIO; rp = rmp->data; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); dtlck = (struct dt_lock *) & tlck->lock; rp->header.flag = (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; rp->header.self = *pxd; /* initialize sibling pointers */ rp->header.next = 0; rp->header.prev = 0; /* * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = 10; /* 1 + 8 + 1 */ dtlck->index++; n = xsize >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* copy old stbl to new stbl at start of extended area */ rp->header.stblindex = DTROOTMAXSLOT; stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; memcpy(stbl, sp->header.stbl, sp->header.nextindex); rp->header.nextindex = sp->header.nextindex; /* copy old data area to start of new data area */ memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = DTROOTMAXSLOT + stblsize; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) rp->header.freelist = n; else { rp->header.freelist = fsi; do { f = &rp->slot[fsi]; fsi = f->next; } while (fsi >= 0); f->next = n; } rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; struct metapage *mp = NULL; struct ldtentry *ldtentry; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * insert the new entry into the new right/child page * (skip index in the new right page will not change) */ dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* * reset parent/root page * * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. * * The btree comparison code guarantees that the left-most key on any * level of the tree is never used, so it doesn't need to be filled in. */ BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the root page (in-memory inode) */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; /* update page header of root */ if (sp->header.flag & BT_LEAF) { sp->header.flag &= ~BT_LEAF; sp->header.flag |= BT_INTERNAL; } /* init the first entry */ s = (struct idtentry *) & sp->slot[DTENTRYSTART]; ppxd = (pxd_t *) s; *ppxd = *pxd; s->next = -1; s->namlen = 0; stbl = sp->header.stbl; stbl[0] = DTENTRYSTART; sp->header.nextindex = 1; /* init freelist */ fsi = DTENTRYSTART + 1; f = &sp->slot[fsi]; /* init free region of remaining area */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; sp->header.freelist = DTENTRYSTART + 1; sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); *rmpp = rmp; return 0; } /* * dtDelete() * * function: delete the entry(s) referenced by a key. * * parameter: * * return: */ int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, ino_t * ino, int flag) { int rc = 0; s64 bn; struct metapage *mp, *imp; dtpage_t *p; int index; struct btstack btstack; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int i; struct ldtentry *ldtentry; u8 *stbl; u32 table_index, next_index; struct metapage *nmp; dtpage_t *np; /* * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). */ if ((rc = dtSearch(ip, key, ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* * We need to find put the index of the next entry into the * directory index table in order to resume a readdir from this * entry. */ if (DO_INDEX(ip)) { stbl = DT_GETSTBL(p); ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; table_index = le32_to_cpu(ldtentry->index); if (index == (p->header.nextindex - 1)) { /* * Last entry in this leaf page */ if ((p->header.flag & BT_ROOT) || (p->header.next == 0)) next_index = -1; else { /* Read next leaf page */ DT_GETPAGE(ip, le64_to_cpu(p->header.next), nmp, PSIZE, np, rc); if (rc) next_index = -1; else { stbl = DT_GETSTBL(np); ldtentry = (struct ldtentry *) & np-> slot[stbl[0]]; next_index = le32_to_cpu(ldtentry->index); DT_PUTPAGE(nmp); } } } else { ldtentry = (struct ldtentry *) & p->slot[stbl[index + 1]]; next_index = le32_to_cpu(ldtentry->index); } free_index(tid, ip, table_index, next_index); } /* * the leaf page becomes empty, delete the page */ if (p->header.nextindex == 1) { /* delete empty page */ rc = dtDeleteUp(tid, ip, mp, p, &btstack); } /* * the leaf page has other entries remaining: * * delete the entry from the leaf page. */ else { BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* * Do not assume that dtlck->index will be zero. During a * rename within a directory, this transaction may have * modified this page already when adding the new entry. */ /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the leaf entry */ dtDeleteEntry(p, index, &dtlck); /* * Update directory index table for entries moved in stbl */ if (DO_INDEX(ip) && index < p->header.nextindex) { s64 lblock; imp = NULL; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { ldtentry = (struct ldtentry *) & p->slot[stbl[i]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), bn, i, &imp, &lblock); } if (imp) release_metapage(imp); } DT_PUTPAGE(mp); } return rc; } /* * dtDeleteUp() * * function: * free empty pages as propagating deletion up the tree * * parameter: * * return: */ static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) { int rc = 0; struct metapage *mp; dtpage_t *p; int index, nextindex; int xlen; struct btframe *parent; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; struct pxd_lock *pxdlock; int i; /* * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(fmp); return 0; } /* * free the non-root leaf page */ /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record * N.B. linelock is overlaid as freed extent descriptor, and * the buffer page is freed; */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = fp->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, fp))) { BT_PUTPAGE(fmp); return rc; } xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); /* * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. * stop if the root page is reached (which is never deleted) or * if the entry deletion does not empty the page. */ while ((parent = BT_POP(btstack)) != NULL) { /* pin the parent page <sp> */ DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); if (rc) return rc; /* * free the extent of the child page deleted */ index = parent->index; /* * delete the entry for the child page from parent */ nextindex = p->header.nextindex; /* * the parent has the single entry being deleted: * * free the parent page which has become empty. */ if (nextindex == 1) { /* * keep the root internal page which has become empty */ if (p->header.flag & BT_ROOT) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(mp); return 0; } /* * free the parent page */ else { /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = p->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, p))) { DT_PUTPAGE(mp); return rc; } xlen = lengthPXD(&p->header.self); /* Free quota allocation */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); /* propagate up */ continue; } } /* * the parent has other entries remaining: * * delete the router entry from the parent page. */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the page * * action: router entry deletion */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the router entry */ dtDeleteEntry(p, index, &dtlck); /* reset key of new leftmost entry of level (for consistency) */ if (index == 0 && ((p->header.flag & BT_ROOT) || p->header.prev == 0)) dtTruncateEntry(p, 0, &dtlck); /* unpin the parent page */ DT_PUTPAGE(mp); /* exit propagation up */ break; } if (!DO_INDEX(ip)) ip->i_size -= PSIZE; return 0; } /* * dtRelink() * * function: * link around a freed page. * * parameter: * fp: page to be freed * * return: */ static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) { int rc; struct metapage *mp; s64 nextbn, prevbn; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; nextbn = le64_to_cpu(p->header.next); prevbn = le64_to_cpu(p->header.prev); /* update prev pointer of the next page */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page * * action: update prev pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(prevbn); DT_PUTPAGE(mp); } /* update next pointer of the previous page */ if (prevbn != 0) { DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the prev page * * action: update next pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.next = cpu_to_le64(nextbn); DT_PUTPAGE(mp); } return 0; } /* * dtInitRoot() * * initialize directory root (inline in inode) */ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); dtroot_t *p; int fsi; struct dtslot *f; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; u16 xflag_save; /* * If this was previously an non-empty directory, we need to remove * the old directory table. */ if (DO_INDEX(ip)) { if (!jfs_dirtable_inline(ip)) { struct tblock *tblk = tid_to_tblock(tid); /* * We're playing games with the tid's xflag. If * we're removing a regular file, the file's xtree * is committed with COMMIT_PMAP, but we always * commit the directories xtree with COMMIT_PWMAP. */ xflag_save = tblk->xflag; tblk->xflag = 0; /* * xtTruncate isn't guaranteed to fully truncate * the xtree. The caller needs to check i_size * after committing the transaction to see if * additional truncation is needed. The * COMMIT_Stale flag tells caller that we * initiated the truncation. */ xtTruncate(tid, ip, 0, COMMIT_PWMAP); set_cflag(COMMIT_Stale, ip); tblk->xflag = xflag_save; } else ip->i_size = 1; jfs_ip->next_index = 2; } else ip->i_size = IDATASIZE; /* * acquire a transaction lock on the root * * action: directory initialization; */ tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, tlckDTREE | tlckENTRY | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; p = &jfs_ip->i_dtroot; p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; p->header.nextindex = 0; /* init freelist */ fsi = 1; f = &p->slot[fsi]; /* init data area of root */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; p->header.freelist = 1; p->header.freecnt = 8; /* init '..' entry */ p->header.idotdot = cpu_to_le32(idotdot); return; } /* * add_missing_indices() * * function: Fix dtree page in which one or more entries has an invalid index. * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. */ static int add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; int i; uint index; struct lv *lv; struct metapage *mp; dtpage_t *p; int rc = 0; s8 *stbl; tid_t tid; struct tlock *tlck; tid = txBegin(inode->i_sb, 0); DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); if (rc) { printk(KERN_ERR "DT_GETPAGE failed!\n"); goto end; } BT_MARK_DIRTY(mp, inode); ASSERT(p->header.flag & BT_LEAF); tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); if (BT_IS_ROOT(mp)) tlck->type |= tlckBTROOT; dtlck = (struct dt_lock *) &tlck->lock; stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { if (stbl[i] < 0) { jfs_err("jfs: add_missing_indices: Invalid stbl[%d] = %d for inode %ld, block = %lld", i, stbl[i], (long)inode->i_ino, (long long)bn); rc = -EIO; DT_PUTPAGE(mp); txAbort(tid, 0); goto end; } d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { d->index = cpu_to_le32(add_index(tid, inode, bn, i)); if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = &dtlck->lv[dtlck->index]; lv->offset = stbl[i]; lv->length = 1; dtlck->index++; } } DT_PUTPAGE(mp); (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); return rc; } /* * Buffer to hold directory entry info while traversing a dtree page * before being fed to the filldir function */ struct jfs_dirent { loff_t position; int ino; u16 name_len; char name[]; }; /* * function to determine next variable-sized jfs_dirent in buffer */ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) { return (struct jfs_dirent *) ((char *)dirent + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + sizeof (loff_t) - 1) & ~(sizeof (loff_t) - 1))); } /* * jfs_readdir() * * function: read directory entries sequentially * from the specified entry offset * * parameter: * * return: offset = (pn, index) of start entry * of next jfs_readdir()/dtRead() */ int jfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *ip = file_inode(file); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) &dtpos; s64 bn; struct metapage *mp; dtpage_t *p; int index; s8 *stbl; struct btstack btstack; int i, next; struct ldtentry *d; struct dtslot *t; int d_namleft, len, outlen; unsigned long dirent_buf; char *name_ptr; u32 dir_index; int do_index = 0; uint loop_count = 0; struct jfs_dirent *jfs_dirent; int jfs_dirents; int overflow, fix_page, page_fixed = 0; static int unique_pos = 2; /* If we can't fix broken index */ if (ctx->pos == DIREND) return 0; if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. * Special cases: 0 = . * 1 = .. * -1 = End of directory */ do_index = 1; dir_index = (u32) ctx->pos; /* * NFSv4 reserves cookies 1 and 2 for . and .. so the value * we return to the vfs is one greater than the one we use * internally. */ if (dir_index) dir_index--; if (dir_index > 1) { struct dir_table_slot dirtab_slot; if (dtEmpty(ip) || (dir_index >= JFS_IP(ip)->next_index)) { /* Stale position. Directory has shrunk */ ctx->pos = DIREND; return 0; } repeat: rc = read_index(ip, dir_index, &dirtab_slot); if (rc) { ctx->pos = DIREND; return rc; } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { jfs_err("jfs_readdir detected infinite loop!"); ctx->pos = DIREND; return 0; } dir_index = le32_to_cpu(dirtab_slot.addr2); if (dir_index == -1) { ctx->pos = DIREND; return 0; } goto repeat; } bn = addressDTS(&dirtab_slot); index = dirtab_slot.slot; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { ctx->pos = DIREND; return 0; } if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); ctx->pos = DIREND; return 0; } } else { if (dir_index == 0) { /* * self "." */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; /* * Find first entry of left-most leaf */ if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadFirst(ip, &btstack))) return rc; DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); } } else { /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * * pn = 0; index = 1: First entry "." * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; if (dtpos < 2) { /* build "." entry */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { jfs_err("jfs_readdir called with invalid offset!"); } dtoffset->pn = 1; dtoffset->index = 0; ctx->pos = dtpos; } if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext", rc); ctx->pos = DIREND; return 0; } /* get start leaf page and index */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* offset beyond directory eof ? */ if (bn < 0) { ctx->pos = DIREND; return 0; } } dirent_buf = __get_free_page(GFP_KERNEL); if (dirent_buf == 0) { DT_PUTPAGE(mp); jfs_warn("jfs_readdir: __get_free_page failed!"); ctx->pos = DIREND; return -ENOMEM; } while (1) { jfs_dirent = (struct jfs_dirent *) dirent_buf; jfs_dirents = 0; overflow = fix_page = 0; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { if (stbl[i] < 0 || stbl[i] > 127) { jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld", i, stbl[i], (long)ip->i_ino, (long long)bn); free_page(dirent_buf); DT_PUTPAGE(mp); return -EIO; } d = (struct ldtentry *) & p->slot[stbl[i]]; if (((long) jfs_dirent + d->namlen + 1) > (dirent_buf + PAGE_SIZE)) { /* DBCS codepages could overrun dirent_buf */ index = i; overflow = 1; break; } d_namleft = d->namlen; name_ptr = jfs_dirent->name; jfs_dirent->ino = le32_to_cpu(d->inumber); if (do_index) { len = min(d_namleft, DTLHDRDATALEN); jfs_dirent->position = le32_to_cpu(d->index); /* * d->index should always be valid, but it * isn't. fsck.jfs doesn't create the * directory index for the lost+found * directory. Rather than let it go, * we can try to fix it. */ if ((jfs_dirent->position < 2) || (jfs_dirent->position >= JFS_IP(ip)->next_index)) { if (!page_fixed && !isReadOnly(ip)) { fix_page = 1; /* * setting overflow and setting * index to i will cause the * same page to be processed * again starting here */ overflow = 1; index = i; break; } jfs_dirent->position = unique_pos++; } /* * We add 1 to the index because we may * use a value of 2 internally, and NFSv4 * doesn't like that. */ jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); } /* copy the name of head/only segment */ outlen = jfs_strfromUCS_le(name_ptr, d->name, len, codepage); jfs_dirent->name_len = outlen; /* copy name in the additional segment(s) */ next = d->next; while (next >= 0) { t = (struct dtslot *) & p->slot[next]; name_ptr += outlen; d_namleft -= len; /* Sanity Check */ if (d_namleft == 0) { jfs_error(ip->i_sb, "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", (long)ip->i_ino, (long long)bn, i); goto skip_one; } len = min(d_namleft, DTSLOTDATALEN); outlen = jfs_strfromUCS_le(name_ptr, t->name, len, codepage); jfs_dirent->name_len += outlen; next = t->next; } jfs_dirents++; jfs_dirent = next_jfs_dirent(jfs_dirent); skip_one: if (!do_index) dtoffset->index++; } if (!overflow) { /* Point to next leaf page */ if (p->header.flag & BT_ROOT) bn = 0; else { bn = le64_to_cpu(p->header.next); index = 0; /* update offset (pn:index) for new page */ if (!do_index) { dtoffset->pn++; dtoffset->index = 0; } } page_fixed = 0; } /* unpin previous leaf page */ DT_PUTPAGE(mp); jfs_dirent = (struct jfs_dirent *) dirent_buf; while (jfs_dirents--) { ctx->pos = jfs_dirent->position; if (!dir_emit(ctx, jfs_dirent->name, jfs_dirent->name_len, jfs_dirent->ino, DT_UNKNOWN)) goto out; jfs_dirent = next_jfs_dirent(jfs_dirent); } if (fix_page) { if ((rc = add_missing_indices(ip, bn))) goto out; page_fixed = 1; } if (!overflow && (bn == 0)) { ctx->pos = DIREND; break; } DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { free_page(dirent_buf); return rc; } } out: free_page(dirent_buf); return rc; } /* * dtReadFirst() * * function: get the leftmost page of the directory */ static int dtReadFirst(struct inode *ip, struct btstack * btstack) { int rc = 0; s64 bn; int psize = 288; /* initial in-line directory */ struct metapage *mp; dtpage_t *p; s8 *stbl; struct btframe *btsp; pxd_t *xd; BT_CLR(btstack); /* reset stack */ /* * descend leftmost path of the tree * * by convention, root bn = 0. */ for (bn = 0;;) { DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) return rc; /* * leftmost leaf page */ if (p->header.flag & BT_LEAF) { /* return leftmost entry */ btsp = btstack->top; btsp->bn = bn; btsp->index = 0; btsp->mp = mp; return 0; } /* * descend down to leftmost child page */ if (BT_STACK_FULL(btstack)) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "btstack overrun\n"); BT_STACK_DUMP(btstack); return -EIO; } /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, 0); /* get the leftmost entry */ stbl = DT_GETSTBL(p); if (stbl[0] < 0 || stbl[0] > 127) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "stbl[0] out of bound\n"); return -EIO; } xd = (pxd_t *) & p->slot[stbl[0]]; /* get the child page block address */ bn = addressPXD(xd); psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } } /* * dtReadNext() * * function: get the page of the specified offset (pn:index) * * return: if (offset > eof), bn = -1; * * note: if index > nextindex of the target leaf page, * start with 1st entry of next leaf page; */ static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack) { int rc = 0; struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) offset; s64 bn; struct metapage *mp; dtpage_t *p; int index; int pn; s8 *stbl; struct btframe *btsp, *parent; pxd_t *xd; /* * get leftmost leaf page pinned */ if ((rc = dtReadFirst(ip, btstack))) return rc; /* get leaf page */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* get the start offset (pn:index) */ pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ index = dtoffset->index; /* start at leftmost page ? */ if (pn == 0) { /* offset beyond eof ? */ if (index < p->header.nextindex) goto out; if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = index = 0; goto a; } /* start at non-leftmost page: scan parent pages for large pn */ if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start after next leaf page ? */ if (pn > 1) goto b; /* get leaf page pn = 1 */ a: bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } goto c; /* * scan last internal page level to get target leaf page */ b: /* unpin leftmost leaf page */ DT_PUTPAGE(mp); /* get left most parent page */ btsp = btstack->top; parent = btsp - 1; bn = parent->bn; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* scan parent pages at last internal page level */ while (pn >= p->header.nextindex) { pn -= p->header.nextindex; /* get next parent page address */ bn = le64_to_cpu(p->header.next); /* unpin current parent page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next parent page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* update parent page stack frame */ parent->bn = bn; } /* get leaf page address */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[pn]]; bn = addressPXD(xd); /* unpin parent page */ DT_PUTPAGE(mp); /* * get target leaf page */ c: DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * leaf page has been completed: * start with 1st entry of next leaf page */ if (index >= p->header.nextindex) { bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next leaf page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = 0; } out: /* return target leaf page pinned */ btsp = btstack->top; btsp->bn = bn; btsp->index = dtoffset->index; btsp->mp = mp; return 0; } /* * dtCompare() * * function: compare search key with an internal entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int dtCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si) { /* entry slot index */ wchar_t *kname; __le16 *name; int klen, namlen, len, rc; struct idtentry *ih; struct dtslot *t; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); /* compare with head/only segment */ len = min(klen, len); if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; /* compare with additional segment(s) */ kname += len; while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; kname += len; si = t->next; } return (klen - namlen); } /* * ciCompare() * * function: compare search key with an (leaf/internal) entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int ciCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si, /* entry slot index */ int flag) { wchar_t *kname, x; __le16 *name; int klen, namlen, len, rc; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int i; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; /* * leaf page entry */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; name = lh->name; namlen = lh->namlen; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } /* * internal page entry */ else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); } /* compare with head/only segment */ len = min(klen, len); for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; /* compare with additional segment(s) */ while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; si = t->next; } return (klen - namlen); } /* * ciGetLeafPrefixKey() * * function: compute prefix of suffix compression * from two adjacent leaf entries * across page boundary * * return: non-zero on error * */ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag) { int klen, namlen; wchar_t *pl, *pr, *kname; struct component_name lkey; struct component_name rkey; lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) return -ENOMEM; rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); return -ENOMEM; } /* get left and right key */ dtGetKey(lp, li, &lkey, flag); lkey.name[lkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&lkey); dtGetKey(rp, ri, &rkey, flag); rkey.name[rkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&rkey); /* compute prefix */ klen = 0; kname = key->name; namlen = min(lkey.namlen, rkey.namlen); for (pl = lkey.name, pr = rkey.name; namlen; pl++, pr++, namlen--, klen++, kname++) { *kname = *pr; if (*pl != *pr) { key->namlen = klen + 1; goto free_names; } } /* l->namlen <= r->namlen since l <= r */ if (lkey.namlen < rkey.namlen) { *kname = *pr; key->namlen = klen + 1; } else /* l->namelen == r->namelen */ key->namlen = klen; free_names: kfree(lkey.name); kfree(rkey.name); return 0; } /* * dtGetKey() * * function: get key of the entry */ static void dtGetKey(dtpage_t * p, int i, /* entry index */ struct component_name * key, int flag) { int si; s8 *stbl; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int namlen, len; wchar_t *kname; __le16 *name; /* get entry */ stbl = DT_GETSTBL(p); si = stbl[i]; if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; namlen = lh->namlen; name = lh->name; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; namlen = ih->namlen; name = ih->name; len = min(namlen, DTIHDRDATALEN); } key->namlen = namlen; kname = key->name; /* * move head/only segment */ UniStrncpy_from_le(kname, name, len); /* * move additional segment(s) */ while (si >= 0) { /* get next segment */ t = &p->slot[si]; kname += len; namlen -= len; len = min(namlen, DTSLOTDATALEN); UniStrncpy_from_le(kname, t->name, len); si = t->next; } } /* * dtInsertEntry() * * function: allocate free slot(s) and * write a leaf/internal entry * * return: entry slot index */ static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock ** dtlock) { struct dtslot *h, *t; struct ldtentry *lh = NULL; struct idtentry *ih = NULL; int hsi, fsi, klen, len, nextindex; wchar_t *kname; __le16 *name; s8 *stbl; pxd_t *xd; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; s64 bn = 0; struct metapage *mp = NULL; klen = key->namlen; kname = key->name; /* allocate a free slot */ hsi = fsi = p->header.freelist; h = &p->slot[fsi]; p->header.freelist = h->next; --p->header.freecnt; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = hsi; /* write head/only segment */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) h; lh->next = h->next; lh->inumber = cpu_to_le32(data->leaf.ino); lh->namlen = klen; name = lh->name; if (data->leaf.ip) { len = min(klen, DTLHDRDATALEN); if (!(p->header.flag & BT_ROOT)) bn = addressPXD(&p->header.self); lh->index = cpu_to_le32(add_index(data->leaf.tid, data->leaf.ip, bn, index)); } else len = min(klen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) h; ih->next = h->next; xd = (pxd_t *) ih; *xd = data->xd; ih->namlen = klen; name = ih->name; len = min(klen, DTIHDRDATALEN); } UniStrncpy_to_le(name, kname, len); n = 1; xsi = hsi; /* write additional segment(s) */ t = h; klen -= len; while (klen) { /* get free slot */ fsi = p->header.freelist; t = &p->slot[fsi]; p->header.freelist = t->next; --p->header.freecnt; /* is next slot contiguous ? */ if (fsi != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = fsi; n = 0; } kname += len; len = min(klen, DTSLOTDATALEN); UniStrncpy_to_le(t->name, kname, len); n++; xsi = fsi; klen -= len; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* terminate last/only segment */ if (h == t) { /* single segment entry */ if (p->header.flag & BT_LEAF) lh->next = -1; else ih->next = -1; } else /* multi-segment entry */ t->next = -1; /* if insert into middle, shift right succeeding entries in stbl */ stbl = DT_GETSTBL(p); nextindex = p->header.nextindex; if (index < nextindex) { memmove(stbl + index + 1, stbl + index, nextindex - index); if ((p->header.flag & BT_LEAF) && data->leaf.ip) { s64 lblock; /* * Need to update slot number for entries that moved * in the stbl */ mp = NULL; for (n = index + 1; n <= nextindex; n++) { lh = (struct ldtentry *) & (p->slot[stbl[n]]); modify_index(data->leaf.tid, data->leaf.ip, le32_to_cpu(lh->index), bn, n, &mp, &lblock); } if (mp) release_metapage(mp); } } stbl[index] = hsi; /* advance next available entry index of stbl */ ++p->header.nextindex; } /* * dtMoveEntry() * * function: move entries from split/left page to new/right page * * nextindex of dst page and freelist/freecnt of both pages * are updated. */ static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index) { int ssi, next; /* src slot index */ int di; /* dst entry index */ int dsi; /* dst slot index */ s8 *sstbl, *dstbl; /* sorted entry table */ int snamlen, len; struct ldtentry *slh, *dlh = NULL; struct idtentry *sih, *dih = NULL; struct dtslot *h, *s, *d; struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; struct lv *slv, *dlv; int xssi, ns, nd; int sfsi; sstbl = (s8 *) & sp->slot[sp->header.stblindex]; dstbl = (s8 *) & dp->slot[dp->header.stblindex]; dsi = dp->header.freelist; /* first (whole page) free slot */ sfsi = sp->header.freelist; /* linelock destination entry slot */ dlv = & ddtlck->lv[ddtlck->index]; dlv->offset = dsi; /* linelock source entry slot */ slv = & sdtlck->lv[sdtlck->index]; slv->offset = sstbl[si]; xssi = slv->offset - 1; /* * move entries */ ns = nd = 0; for (di = 0; si < sp->header.nextindex; si++, di++) { ssi = sstbl[si]; dstbl[di] = dsi; /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* * move head/only segment of an entry */ /* get dst slot */ h = d = &dp->slot[dsi]; /* get src slot and move */ s = &sp->slot[ssi]; if (sp->header.flag & BT_LEAF) { /* get source entry */ slh = (struct ldtentry *) s; dlh = (struct ldtentry *) h; snamlen = slh->namlen; if (do_index) { len = min(snamlen, DTLHDRDATALEN); dlh->index = slh->index; /* little-endian */ } else len = min(snamlen, DTLHDRDATALEN_LEGACY); memcpy(dlh, slh, 6 + len * 2); next = slh->next; /* update dst head/only segment next field */ dsi++; dlh->next = dsi; } else { sih = (struct idtentry *) s; snamlen = sih->namlen; len = min(snamlen, DTIHDRDATALEN); dih = (struct idtentry *) h; memcpy(dih, sih, 10 + len * 2); next = sih->next; dsi++; dih->next = dsi; } /* free src head/only segment */ s->next = sfsi; s->cnt = 1; sfsi = ssi; ns++; nd++; xssi = ssi; /* * move additional segment(s) of the entry */ snamlen -= len; while ((ssi = next) >= 0) { /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* get next source segment */ s = &sp->slot[ssi]; /* get next destination free slot */ d++; len = min(snamlen, DTSLOTDATALEN); UniStrncpy_le(d->name, s->name, len); ns++; nd++; xssi = ssi; dsi++; d->next = dsi; /* free source segment */ next = s->next; s->next = sfsi; s->cnt = 1; sfsi = ssi; snamlen -= len; } /* end while */ /* terminate dst last/only segment */ if (h == d) { /* single segment entry */ if (dp->header.flag & BT_LEAF) dlh->next = -1; else dih->next = -1; } else /* multi-segment entry */ d->next = -1; } /* end for */ /* close current linelock */ slv->length = ns; sdtlck->index++; *sdtlock = sdtlck; dlv->length = nd; ddtlck->index++; *ddtlock = ddtlck; /* update source header */ sp->header.freelist = sfsi; sp->header.freecnt += nd; /* update destination header */ dp->header.nextindex = di; dp->header.freelist = dsi; dp->header.freecnt -= nd; } /* * dtDeleteEntry() * * function: free a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); fsi = stbl[fi]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; /* get the head/only segment */ t = &p->slot[fsi]; if (p->header.flag & BT_LEAF) si = ((struct ldtentry *) t)->next; else si = ((struct idtentry *) t)->next; t->next = si; t->cnt = 1; n = freecnt = 1; xsi = fsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; /* if delete from middle, * shift left the succedding entries in the stbl */ si = p->header.nextindex; if (fi < si - 1) memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); p->header.nextindex--; } /* * dtTruncateEntry() * * function: truncate a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) { int tsi; /* truncate entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int fsi, xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); tsi = stbl[ti]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = tsi; /* get the head/only segment */ t = &p->slot[tsi]; ASSERT(p->header.flag & BT_INTERNAL); ((struct idtentry *) t)->namlen = 0; si = ((struct idtentry *) t)->next; ((struct idtentry *) t)->next = -1; n = 1; freecnt = 0; fsi = si; xsi = tsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ if (freecnt == 0) return; t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; } /* * dtLinelockFreelist() */ static void dtLinelockFreelist(dtpage_t * p, /* directory page */ int m, /* max slot index */ struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ struct dtslot *t; int si; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ fsi = p->header.freelist; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; n = 1; xsi = fsi; t = &p->slot[fsi]; si = t->next; /* find the last/only segment */ while (si < m && si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; t = &p->slot[si]; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; } /* * NAME: dtModify * * FUNCTION: Modify the inode number part of a directory entry * * PARAMETERS: * tid - Transaction id * ip - Inode of parent directory * key - Name of entry to be modified * orig_ino - Original inode number expected in entry * new_ino - New inode number to put into entry * flag - JFS_RENAME * * RETURNS: * -ESTALE - If entry found does not match orig_ino passed in * -ENOENT - If no entry can be found to match key * 0 - If successfully modified entry */ int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) { int rc; s64 bn; struct metapage *mp; dtpage_t *p; int index; struct btstack btstack; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; s8 *stbl; int entry_si; /* entry slot index */ struct ldtentry *entry; /* * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page of named entry */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* get slot index of the entry */ stbl = DT_GETSTBL(p); entry_si = stbl[index]; /* linelock entry */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = entry_si; lv->length = 1; dtlck->index++; /* get the head/only segment */ entry = (struct ldtentry *) & p->slot[entry_si]; /* substitute the inode number of the entry */ entry->inumber = cpu_to_le32(new_ino); /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; }
27 63 62 36 48 6 135 136 136 86 3 135 130 15 103 130 136 5 135 48 48 48 155 147 147 1 150 2 151 131 149 47 47 2 45 47 46 5 5 66 64 65 65 65 66 62 48 66 62 47 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "backpointers.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" #include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" #include "inode.h" #include "io_misc.h" #include "lru.h" #include "quota.h" #include "reflink.h" #include "snapshot.h" #include "subvolume.h" #include "xattr.h" const char * const bch2_bkey_types[] = { #define x(name, nr, ...) #name, BCH_BKEY_TYPES() #undef x NULL }; static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { return 0; } #define bch2_bkey_ops_deleted ((struct bkey_ops) { \ .key_validate = deleted_key_validate, \ }) #define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ .key_validate = deleted_key_validate, \ }) static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { int ret = 0; bkey_fsck_err_on(bkey_val_bytes(k.k), c, bkey_val_size_nonzero, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); fsck_err: return ret; } #define bch2_bkey_ops_error ((struct bkey_ops) { \ .key_validate = empty_val_key_validate, \ }) static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { return 0; } static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ .key_validate = key_type_cookie_validate, \ .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ .key_validate = empty_val_key_validate, \ }) static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { return 0; } static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); unsigned datalen = bkey_inline_data_bytes(k.k); prt_printf(out, "datalen %u: %*phN", datalen, min(datalen, 32U), d.v->data); } #define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ .key_validate = key_type_inline_data_validate, \ .val_to_text = key_type_inline_data_to_text, \ }) static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { bch2_key_resize(l.k, l.k->size + r.k->size); return true; } #define bch2_bkey_ops_set ((struct bkey_ops) { \ .key_validate = empty_val_key_validate, \ .key_merge = key_type_set_merge, \ }) const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() #undef x }; const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, bkey_val_size_too_small, "bad val size (%zu < %u)", bkey_val_bytes(k.k), ops->min_val_size); if (!ops->key_validate) return 0; ret = ops->key_validate(c, k, from); fsck_err: return ret; } static u64 bch2_key_types_allowed[] = { [BKEY_TYPE_btree] = BIT_ULL(KEY_TYPE_deleted)| BIT_ULL(KEY_TYPE_btree_ptr)| BIT_ULL(KEY_TYPE_btree_ptr_v2), #define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, BCH_BTREE_IDS() #undef x }; static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { #define x(name, nr, flags) [KEY_TYPE_##name] = flags, BCH_BKEY_TYPES() #undef x }; const char *bch2_btree_node_type_str(enum btree_node_type type) { return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); } int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { enum btree_node_type type = __btree_node_type(from.level, from.btree); if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; int ret = 0; bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, bkey_u64s_too_small, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); if (type >= BKEY_TYPE_NR) return 0; enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX ? bch2_bkey_type_flags[k.k->type] : 0; bool strict_key_type_allowed = (from.flags & BCH_VALIDATE_commit) || type == BKEY_TYPE_btree || (from.btree < BTREE_ID_NR && (bkey_flags & BKEY_TYPE_strict_btree_checks)); bkey_fsck_err_on(strict_key_type_allowed && k.k->type < KEY_TYPE_MAX && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", bch2_btree_node_type_str(type), k.k->type < KEY_TYPE_MAX ? bch2_bkey_types[k.k->type] : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { bkey_fsck_err_on(k.k->size == 0, c, bkey_extent_size_zero, "size == 0"); bkey_fsck_err_on(k.k->size > k.k->p.offset, c, bkey_extent_size_greater_than_offset, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); } else { bkey_fsck_err_on(k.k->size, c, bkey_size_nonzero, "size != 0"); } if (type != BKEY_TYPE_btree) { enum btree_id btree = type - 1; if (btree_type_has_snapshots(btree)) { bkey_fsck_err_on(!k.k->p.snapshot, c, bkey_snapshot_zero, "snapshot == 0"); } else if (!btree_type_has_snapshot_field(btree)) { bkey_fsck_err_on(k.k->p.snapshot, c, bkey_snapshot_nonzero, "nonzero snapshot"); } else { /* * btree uses snapshot field but it's not required to be * nonzero */ } bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, bkey_at_pos_max, "key at POS_MAX"); } fsck_err: return ret; } int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { return __bch2_bkey_validate(c, k, from) ?: bch2_bkey_val_validate(c, k, from); } int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, struct bkey_s_c k, struct bkey_validate_context from) { int ret = 0; bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, bkey_before_start_of_btree_node, "key before start of btree node"); bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, bkey_after_end_of_btree_node, "key past end of btree node"); fsck_err: return ret; } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { if (bpos_eq(pos, POS_MIN)) prt_printf(out, "POS_MIN"); else if (bpos_eq(pos, POS_MAX)) prt_printf(out, "POS_MAX"); else if (bpos_eq(pos, SPOS_MAX)) prt_printf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) prt_printf(out, "U64_MAX"); else prt_printf(out, "%llu", pos.inode); prt_printf(out, ":"); if (pos.offset == U64_MAX) prt_printf(out, "U64_MAX"); else prt_printf(out, "%llu", pos.offset); prt_printf(out, ":"); if (pos.snapshot == U32_MAX) prt_printf(out, "U32_MAX"); else prt_printf(out, "%u", pos.snapshot); } } void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { if (k) { prt_printf(out, "u64s %u type ", k->u64s); if (k->type < KEY_TYPE_MAX) prt_printf(out, "%s ", bch2_bkey_types[k->type]); else prt_printf(out, "%u ", k->type); bch2_bpos_to_text(out, k->p); prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo); } else { prt_printf(out, "(null)"); } } void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); if (likely(ops->val_to_text)) ops->val_to_text(out, c, k); } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { bch2_bkey_to_text(out, k.k); if (bkey_val_bytes(k.k)) { prt_printf(out, ": "); bch2_val_to_text(out, c, k); } } void bch2_bkey_swab_val(struct bkey_s k) { const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); if (ops->swab) ops->swab(k); } bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) { const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); return ops->key_normalize ? ops->key_normalize(c, k) : false; } bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); return ops->key_merge && bch2_bkey_maybe_mergable(l.k, r.k) && (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && !static_branch_unlikely(&bch2_key_merging_disabled) && ops->key_merge(c, l, r); } static const struct old_bkey_type { u8 btree_node_type; u8 old; u8 new; } bkey_renumber_table[] = { {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, }; void bch2_bkey_renumber(enum btree_node_type btree_node_type, struct bkey_packed *k, int write) { const struct old_bkey_type *i; for (i = bkey_renumber_table; i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); i++) if (btree_node_type == i->btree_node_type && k->type == (write ? i->new : i->old)) { k->type = write ? i->old : i->new; break; } } void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, int write, struct bkey_format *f, struct bkey_packed *k) { const struct bkey_ops *ops; struct bkey uk; unsigned nr_compat = 5; int i; /* * Do these operations in reverse order in the write path: */ for (i = 0; i < nr_compat; i++) switch (!write ? i : nr_compat - 1 - i) { case 0: if (big_endian != CPU_BIG_ENDIAN) { bch2_bkey_swab_key(f, k); } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { bch2_bkey_swab_key(f, k); bch2_bkey_swab_key(f, k); } break; case 1: if (version < bcachefs_metadata_version_bkey_renumber) bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); break; case 2: if (version < bcachefs_metadata_version_inode_btree_change && btree_id == BTREE_ID_inodes) { if (!bkey_packed(k)) { struct bkey_i *u = packed_to_bkey(k); swap(u->k.p.inode, u->k.p.offset); } else if (f->bits_per_field[BKEY_FIELD_INODE] && f->bits_per_field[BKEY_FIELD_OFFSET]) { struct bkey_format tmp = *f, *in = f, *out = &tmp; swap(tmp.bits_per_field[BKEY_FIELD_INODE], tmp.bits_per_field[BKEY_FIELD_OFFSET]); swap(tmp.field_offset[BKEY_FIELD_INODE], tmp.field_offset[BKEY_FIELD_OFFSET]); if (!write) swap(in, out); uk = __bch2_bkey_unpack_key(in, k); swap(uk.p.inode, uk.p.offset); BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); } } break; case 3: if (version < bcachefs_metadata_version_snapshot && (level || btree_type_has_snapshots(btree_id))) { struct bkey_i *u = packed_to_bkey(k); if (u) { u->k.p.snapshot = write ? 0 : U32_MAX; } else { u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); u64 max_packed = min_packed + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); uk = __bch2_bkey_unpack_key(f, k); uk.p.snapshot = write ? min_packed : min_t(u64, U32_MAX, max_packed); BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); } } break; case 4: { struct bkey_s u; if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { uk = __bch2_bkey_unpack_key(f, k); u.k = &uk; u.v = bkeyp_val(f, k); } if (big_endian != CPU_BIG_ENDIAN) bch2_bkey_swab_val(u); ops = bch2_bkey_type_ops(k->type); if (ops->compat) ops->compat(btree_id, version, big_endian, write, u); break; } default: BUG(); } }
294 216 294 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 // SPDX-License-Identifier: GPL-2.0-or-later /* * Information interface for ALSA driver * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/slab.h> #include <linux/time.h> #include <linux/string.h> #include <linux/export.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <linux/utsname.h> #include <linux/mutex.h> /* * OSS compatible part */ static DEFINE_MUTEX(strings); static char *snd_sndstat_strings[SNDRV_CARDS][SNDRV_OSS_INFO_DEV_COUNT]; int snd_oss_info_register(int dev, int num, char *string) { char *x; if (snd_BUG_ON(dev < 0 || dev >= SNDRV_OSS_INFO_DEV_COUNT)) return -ENXIO; if (snd_BUG_ON(num < 0 || num >= SNDRV_CARDS)) return -ENXIO; guard(mutex)(&strings); if (string == NULL) { x = snd_sndstat_strings[num][dev]; kfree(x); x = NULL; } else { x = kstrdup(string, GFP_KERNEL); if (x == NULL) return -ENOMEM; } snd_sndstat_strings[num][dev] = x; return 0; } EXPORT_SYMBOL(snd_oss_info_register); static int snd_sndstat_show_strings(struct snd_info_buffer *buf, char *id, int dev) { int idx, ok = -1; char *str; snd_iprintf(buf, "\n%s:", id); guard(mutex)(&strings); for (idx = 0; idx < SNDRV_CARDS; idx++) { str = snd_sndstat_strings[idx][dev]; if (str) { if (ok < 0) { snd_iprintf(buf, "\n"); ok++; } snd_iprintf(buf, "%i: %s\n", idx, str); } } if (ok < 0) snd_iprintf(buf, " NOT ENABLED IN CONFIG\n"); return ok; } static void snd_sndstat_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { snd_iprintf(buffer, "Sound Driver:3.8.1a-980706 (ALSA emulation code)\n"); snd_iprintf(buffer, "Kernel: %s %s %s %s %s\n", init_utsname()->sysname, init_utsname()->nodename, init_utsname()->release, init_utsname()->version, init_utsname()->machine); snd_iprintf(buffer, "Config options: 0\n"); snd_iprintf(buffer, "\nInstalled drivers: \n"); snd_iprintf(buffer, "Type 10: ALSA emulation\n"); snd_iprintf(buffer, "\nCard config: \n"); snd_card_info_read_oss(buffer); snd_sndstat_show_strings(buffer, "Audio devices", SNDRV_OSS_INFO_DEV_AUDIO); snd_sndstat_show_strings(buffer, "Synth devices", SNDRV_OSS_INFO_DEV_SYNTH); snd_sndstat_show_strings(buffer, "Midi devices", SNDRV_OSS_INFO_DEV_MIDI); snd_sndstat_show_strings(buffer, "Timers", SNDRV_OSS_INFO_DEV_TIMERS); snd_sndstat_show_strings(buffer, "Mixers", SNDRV_OSS_INFO_DEV_MIXERS); } int __init snd_info_minor_register(void) { struct snd_info_entry *entry; memset(snd_sndstat_strings, 0, sizeof(snd_sndstat_strings)); entry = snd_info_create_module_entry(THIS_MODULE, "sndstat", snd_oss_root); if (!entry) return -ENOMEM; entry->c.text.read = snd_sndstat_proc_read; return snd_info_register(entry); /* freed in error path */ }
45 1 167 42 97 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2018 Red Hat, Inc. * All rights reserved. */ #ifndef __LIBXFS_AG_H #define __LIBXFS_AG_H 1 #include "xfs_group.h" struct xfs_mount; struct xfs_trans; struct xfs_perag; /* * Per-ag infrastructure */ /* per-AG block reservation data structures*/ struct xfs_ag_resv { /* number of blocks originally reserved here */ xfs_extlen_t ar_orig_reserved; /* number of blocks reserved here */ xfs_extlen_t ar_reserved; /* number of blocks originally asked for */ xfs_extlen_t ar_asked; }; /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. */ struct xfs_perag { struct xfs_group pag_group; unsigned long pag_opstate; uint8_t pagf_bno_level; /* # of levels in bno btree */ uint8_t pagf_cnt_level; /* # of levels in cnt btree */ uint8_t pagf_rmap_level;/* # of levels in rmap btree */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ xfs_agino_t pagi_count; /* number of allocated inodes */ /* * Inode allocation search lookup optimisation. * If the pagino matches, the search for new inodes * doesn't need to search the near ones again straight away */ xfs_agino_t pagl_pagino; xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; uint8_t pagf_refcount_level; /* recount btree height */ /* Blocks reserved for all kinds of metadata. */ struct xfs_ag_resv pag_meta_resv; /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv; /* Precalculated geometry info */ xfs_agino_t agino_min; xfs_agino_t agino_max; #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ #ifdef CONFIG_XFS_ONLINE_REPAIR /* * Alternate btree heights so that online repair won't trip the write * verifiers while rebuilding the AG btrees. */ uint8_t pagf_repair_bno_level; uint8_t pagf_repair_cnt_level; uint8_t pagf_repair_refcount_level; uint8_t pagf_repair_rmap_level; #endif atomic_t pagf_fstrms; /* # of filestreams active in this AG */ spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ struct xfs_buf_cache pag_bcache; /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; #endif /* __KERNEL__ */ }; static inline struct xfs_perag *to_perag(struct xfs_group *xg) { return container_of(xg, struct xfs_perag, pag_group); } static inline struct xfs_group *pag_group(struct xfs_perag *pag) { return &pag->pag_group; } static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag) { return pag->pag_group.xg_mount; } static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag) { return pag->pag_group.xg_gno; } /* * Per-AG operational state. These are atomic flag bits. */ #define XFS_AGSTATE_AGF_INIT 0 #define XFS_AGSTATE_AGI_INIT 1 #define XFS_AGSTATE_PREFERS_METADATA 2 #define XFS_AGSTATE_ALLOWS_INODES 3 #define XFS_AGSTATE_AGFL_NEEDS_RESET 4 #define __XFS_AG_OPSTATE(name, NAME) \ static inline bool xfs_perag_ ## name (struct xfs_perag *pag) \ { \ return test_bit(XFS_AGSTATE_ ## NAME, &pag->pag_opstate); \ } __XFS_AG_OPSTATE(initialised_agf, AGF_INIT) __XFS_AG_OPSTATE(initialised_agi, AGI_INIT) __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount, xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno, xfs_agnumber_t end_agno); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); /* Passive AG references */ static inline struct xfs_perag * xfs_perag_get( struct xfs_mount *mp, xfs_agnumber_t agno) { return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG)); } static inline struct xfs_perag * xfs_perag_hold( struct xfs_perag *pag) { return to_perag(xfs_group_hold(pag_group(pag))); } static inline void xfs_perag_put( struct xfs_perag *pag) { xfs_group_put(pag_group(pag)); } /* Active AG references */ static inline struct xfs_perag * xfs_perag_grab( struct xfs_mount *mp, xfs_agnumber_t agno) { return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG)); } static inline void xfs_perag_rele( struct xfs_perag *pag) { xfs_group_rele(pag_group(pag)); } static inline struct xfs_perag * xfs_perag_next_range( struct xfs_mount *mp, struct xfs_perag *pag, xfs_agnumber_t start_agno, xfs_agnumber_t end_agno) { return to_perag(xfs_group_next_range(mp, pag ? pag_group(pag) : NULL, start_agno, end_agno, XG_TYPE_AG)); } static inline struct xfs_perag * xfs_perag_next_from( struct xfs_mount *mp, struct xfs_perag *pag, xfs_agnumber_t start_agno) { return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1); } static inline struct xfs_perag * xfs_perag_next( struct xfs_mount *mp, struct xfs_perag *pag) { return xfs_perag_next_from(mp, pag, 0); } /* * Per-ag geometry infomation and validation */ xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); static inline bool xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) { return xfs_verify_gbno(pag_group(pag), agbno); } static inline bool xfs_verify_agbext( struct xfs_perag *pag, xfs_agblock_t agbno, xfs_agblock_t len) { return xfs_verify_gbext(pag_group(pag), agbno, len); } /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. */ static inline bool xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino) { if (agino < pag->agino_min) return false; if (agino > pag->agino_max) return false; return true; } /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata, or is NULLAGINO. */ static inline bool xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino) { if (agino == NULLAGINO) return true; return xfs_verify_agino(pag, agino); } static inline bool xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) { return mp->m_sb.sb_logstart > 0 && agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); } static inline struct xfs_perag * xfs_perag_next_wrap( struct xfs_perag *pag, xfs_agnumber_t *agno, xfs_agnumber_t stop_agno, xfs_agnumber_t restart_agno, xfs_agnumber_t wrap_agno) { struct xfs_mount *mp = pag_mount(pag); *agno = pag_agno(pag) + 1; xfs_perag_rele(pag); while (*agno != stop_agno) { if (*agno >= wrap_agno) { if (restart_agno >= stop_agno) break; *agno = restart_agno; } pag = xfs_perag_grab(mp, *agno); if (pag) return pag; (*agno)++; } return NULL; } /* * Iterate all AGs from start_agno through wrap_agno, then restart_agno through * (start_agno - 1). */ #define for_each_perag_wrap_range(mp, start_agno, restart_agno, wrap_agno, agno, pag) \ for ((agno) = (start_agno), (pag) = xfs_perag_grab((mp), (agno)); \ (pag) != NULL; \ (pag) = xfs_perag_next_wrap((pag), &(agno), (start_agno), \ (restart_agno), (wrap_agno))) /* * Iterate all AGs from start_agno through wrap_agno, then 0 through * (start_agno - 1). */ #define for_each_perag_wrap_at(mp, start_agno, wrap_agno, agno, pag) \ for_each_perag_wrap_range((mp), (start_agno), 0, (wrap_agno), (agno), (pag)) /* * Iterate all AGs from start_agno through to the end of the filesystem, then 0 * through (start_agno - 1). */ #define for_each_perag_wrap(mp, start_agno, agno, pag) \ for_each_perag_wrap_at((mp), (start_agno), (mp)->m_sb.sb_agcount, \ (agno), (pag)) struct aghdr_init_data { /* per ag data */ xfs_agblock_t agno; /* ag to init */ xfs_extlen_t agsize; /* new AG size */ struct list_head buffer_list; /* buffer writeback list */ xfs_rfsblock_t nfree; /* cumulative new free space */ /* per header data */ xfs_daddr_t daddr; /* header location */ size_t numblks; /* size of header */ const struct xfs_btree_ops *bc_ops; /* btree ops */ }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp, xfs_extlen_t delta); int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); static inline xfs_fsblock_t xfs_agbno_to_fsb( struct xfs_perag *pag, xfs_agblock_t agbno) { return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno); } static inline xfs_daddr_t xfs_agbno_to_daddr( struct xfs_perag *pag, xfs_agblock_t agbno) { return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno); } static inline xfs_ino_t xfs_agino_to_ino( struct xfs_perag *pag, xfs_agino_t agino) { return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino); } #endif /* __LIBXFS_AG_H */
3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 11 11 10 11 10 11 10 1 2 3 8 4 8 8 8 8 3 7 8 3 8 8 4 4 4 4 4 3 11 8 3 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 // SPDX-License-Identifier: GPL-2.0-only #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/phy_link_topology.h> #include <linux/ptp_clock_kernel.h> #include <net/netdev_lock.h> #include "netlink.h" #include "common.h" #include "bitset.h" #include "ts.h" struct tsinfo_req_info { struct ethnl_req_info base; struct hwtstamp_provider_desc hwprov_desc; }; struct tsinfo_reply_data { struct ethnl_reply_data base; struct kernel_ethtool_ts_info ts_info; struct ethtool_ts_stats stats; }; #define TSINFO_REQINFO(__req_base) \ container_of(__req_base, struct tsinfo_req_info, base) #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) #define ETHTOOL_TS_STAT_CNT \ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = { [ETHTOOL_A_TSINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] = NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), }; int ts_parse_hwtst_provider(const struct nlattr *nest, struct hwtstamp_provider_desc *hwprov_desc, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1, nest, ethnl_ts_hwtst_prov_policy, extack); if (ret < 0) return ret; if (NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) || NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER)) return -EINVAL; ethnl_update_u32(&hwprov_desc->index, tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX], mod); ethnl_update_u32(&hwprov_desc->qualifier, tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER], mod); return 0; } static int tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); bool mod = false; req->hwprov_desc.index = -1; if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER]) return 0; return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER], &req->hwprov_desc, extack, &mod); } static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if (req->hwprov_desc.index != -1) { ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info, &req->hwprov_desc); ethnl_ops_complete(dev); return ret; } if (req_base->flags & ETHTOOL_FLAG_STATS) { ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); if (dev->ethtool_ops->get_ts_stats) dev->ethtool_ops->get_ts_stats(dev, &data->stats); } ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); return ret; } static int tsinfo_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct kernel_ethtool_ts_info *ts_info = &data->ts_info; int len = 0; int ret; BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); if (ts_info->so_timestamping) { ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL, __SOF_TIMESTAMPING_CNT, sof_timestamping_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_TIMESTAMPING */ } if (ts_info->tx_types) { ret = ethnl_bitset32_size(&ts_info->tx_types, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_TX_TYPES */ } if (ts_info->rx_filters) { ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_RX_FILTERS */ } if (ts_info->phc_index >= 0) { len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ /* _TSINFO_HWTSTAMP_PROVIDER */ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); } if (ts_info->phc_source) { len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */ if (ts_info->phc_phyindex) /* _TSINFO_HWTSTAMP_PHYINDEX */ len += nla_total_size(sizeof(u32)); } if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; return len; } static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_uint(skb, attrtype, val)) return -EMSGSIZE; return 0; } static int tsinfo_put_stats(struct sk_buff *skb, const struct ethtool_ts_stats *stats) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS); if (!nest) return -EMSGSIZE; if (tsinfo_put_stat(skb, stats->tx_stats.pkts, ETHTOOL_A_TS_STAT_TX_PKTS) || tsinfo_put_stat(skb, stats->tx_stats.onestep_pkts_unconfirmed, ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED) || tsinfo_put_stat(skb, stats->tx_stats.lost, ETHTOOL_A_TS_STAT_TX_LOST) || tsinfo_put_stat(skb, stats->tx_stats.err, ETHTOOL_A_TS_STAT_TX_ERR)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int tsinfo_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct kernel_ethtool_ts_info *ts_info = &data->ts_info; int ret; if (ts_info->so_timestamping) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING, &ts_info->so_timestamping, NULL, __SOF_TIMESTAMPING_CNT, sof_timestamping_names, compact); if (ret < 0) return ret; } if (ts_info->tx_types) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES, &ts_info->tx_types, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; } if (ts_info->rx_filters) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS, &ts_info->rx_filters, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; } if (ts_info->phc_index >= 0) { struct nlattr *nest; ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index); if (ret) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, ts_info->phc_index) || nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, ts_info->phc_qualifier)) { nla_nest_cancel(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest); } if (ts_info->phc_source) { if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE, ts_info->phc_source)) return -EMSGSIZE; if (ts_info->phc_phyindex && nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX, ts_info->phc_phyindex)) return -EMSGSIZE; } if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; return 0; } struct ethnl_tsinfo_dump_ctx { struct tsinfo_req_info *req_info; struct tsinfo_reply_data *reply_data; unsigned long pos_ifindex; bool netdev_dump_done; unsigned long pos_phyindex; enum hwtstamp_provider_qualifier pos_phcqualifier; }; static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb, struct net_device *dev, struct tsinfo_reply_data *reply_data, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; void *ehdr = NULL; ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_TSINFO_GET_REPLY); if (!ehdr) return ERR_PTR(-EMSGSIZE); reply_data = ctx->reply_data; memset(reply_data, 0, sizeof(*reply_data)); reply_data->base.dev = dev; reply_data->ts_info.cmd = ETHTOOL_GET_TS_INFO; reply_data->ts_info.phc_index = -1; return ehdr; } static int ethnl_tsinfo_end_dump(struct sk_buff *skb, struct net_device *dev, struct tsinfo_req_info *req_info, struct tsinfo_reply_data *reply_data, void *ehdr) { int ret; reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER); if (ret < 0) return ret; ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base); if (ret < 0) return ret; reply_data->base.dev = NULL; genlmsg_end(skb, ehdr); return ret; } static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb, struct net_device *dev, struct phy_device *phydev, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct tsinfo_reply_data *reply_data; struct tsinfo_req_info *req_info; void *ehdr = NULL; int ret = 0; if (!phy_has_tsinfo(phydev)) return -EOPNOTSUPP; reply_data = ctx->reply_data; req_info = ctx->req_info; ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); if (IS_ERR(ehdr)) return PTR_ERR(ehdr); ret = phy_ts_info(phydev, &reply_data->ts_info); if (ret < 0) goto err; if (reply_data->ts_info.phc_index >= 0) { reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB; reply_data->ts_info.phc_phyindex = phydev->phyindex; } ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) goto err; return ret; err: genlmsg_cancel(skb, ehdr); return ret; } static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, struct net_device *dev, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; const struct ethtool_ops *ops = dev->ethtool_ops; struct tsinfo_reply_data *reply_data; struct tsinfo_req_info *req_info; void *ehdr = NULL; int ret = 0; if (!ops->get_ts_info) return -EOPNOTSUPP; reply_data = ctx->reply_data; req_info = ctx->req_info; for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT; ctx->pos_phcqualifier++) { if (!net_support_hwtstamp_qualifier(dev, ctx->pos_phcqualifier)) continue; ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); if (IS_ERR(ehdr)) { ret = PTR_ERR(ehdr); goto err; } reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier; ret = ops->get_ts_info(dev, &reply_data->ts_info); if (ret < 0) goto err; if (reply_data->ts_info.phc_index >= 0) reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV; ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) goto err; } return ret; err: genlmsg_cancel(skb, ehdr); return ret; } static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb, struct net_device *dev, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct phy_device_node *pdn; int ret = 0; if (!ctx->netdev_dump_done) { ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb); if (ret < 0 && ret != -EOPNOTSUPP) return ret; ctx->netdev_dump_done = true; } if (!dev->link_topo) { if (phy_has_tsinfo(dev->phydev)) { ret = ethnl_tsinfo_dump_one_phydev(skb, dev, dev->phydev, cb); if (ret < 0 && ret != -EOPNOTSUPP) return ret; } return 0; } xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, ctx->pos_phyindex) { if (phy_has_tsinfo(pdn->phy)) { ret = ethnl_tsinfo_dump_one_phydev(skb, dev, pdn->phy, cb); if (ret < 0 && ret != -EOPNOTSUPP) return ret; } } return ret; } int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int ret = 0; rtnl_lock(); if (ctx->req_info->base.dev) { dev = ctx->req_info->base.dev; netdev_lock_ops(dev); ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); netdev_unlock_ops(dev); } else { for_each_netdev_dump(net, dev, ctx->pos_ifindex) { netdev_lock_ops(dev); ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); netdev_unlock_ops(dev); if (ret < 0 && ret != -EOPNOTSUPP) break; ctx->pos_phyindex = 0; ctx->netdev_dump_done = false; ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; } } rtnl_unlock(); return ret; } int ethnl_tsinfo_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->info.attrs; struct tsinfo_reply_data *reply_data; struct tsinfo_req_info *req_info; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } ret = ethnl_parse_header_dev_get(&req_info->base, tb[ETHTOOL_A_TSINFO_HEADER], sock_net(cb->skb->sk), cb->extack, false); if (ret < 0) goto free_reply_data; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; ctx->pos_phyindex = 0; ctx->netdev_dump_done = false; ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } int ethnl_tsinfo_done(struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct tsinfo_req_info *req_info = ctx->req_info; ethnl_parse_header_dev_put(&req_info->base); kfree(ctx->reply_data); kfree(ctx->req_info); return 0; } const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .request_cmd = ETHTOOL_MSG_TSINFO_GET, .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY, .hdr_attr = ETHTOOL_A_TSINFO_HEADER, .req_info_size = sizeof(struct tsinfo_req_info), .reply_data_size = sizeof(struct tsinfo_reply_data), .parse_request = tsinfo_parse_request, .prepare_data = tsinfo_prepare_data, .reply_size = tsinfo_reply_size, .fill_reply = tsinfo_fill_reply, };
12 4 1 7 5 2 5 8 13 12 1 62 51 10 49 7 43 7 34 12 12 25 12 12 20 8 8 8 13 13 3 7 5 5 174 141 5 5 1 2 2 2 7 18 3 85 2 46 43 3 3 5 3 8 7 3 3 2 24 3 15 7 4 2 3 7 7 6 6 5 9 9 3 6 9 9 8 5 4 4 4 4 2 2 14 20 42 23 14 26 2 38 38 36 2 5 17 24 23 3 4 4 131 131 3 131 17 1 15 2 2 27 29 29 1 5 7 10 23 1 1 21 2 1 17 10 1 9 1 6 1 914 122 58 30 31 1 47 4 3 96 699 5 5 5 5 4 4 3 1 3 5 2 3 2 5 1 5 1 3 1 2 1 1 2 9 1 7 1 2 7 1 1 1 1 1 1 1 1 1 2 3 4 2 1 2 568 140 132 4 5 5 3 5 1 5 12 2 1 1 1 1 1 1 1 2 1 7 3 3 3 2 3 3 1 3 1 3 16 7 15 8 2 2 1 1 2 1 19 1 2 7 7 10 23 1 29 499 82 2 79 14 1 3 5 79 914 3 415 1 497 13 1 12 6 5 4 196 1 9 184 1 119 1 1 2 1 1 1 1 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 6 2 1 4 1 1 11 9 8 12 13 7 1 1 1 1 1 1 11 13 2 92 1 1 29 194 104 72 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The IP to API glue. * * Authors: see ip.c * * Fixes: * Many : Split from ip.c , see ip.c for history. * Martin Mares : TOS setting fixed. * Alan Cox : Fixed a couple of oopses in Martin's * TOS tweaks. * Mike McLagan : Routing by source */ #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp_states.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/netfilter.h> #include <linux/route.h> #include <linux/mroute.h> #include <net/inet_ecn.h> #include <net/route.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/checksum.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/transp_v6.h> #endif #include <net/ip_fib.h> #include <linux/errqueue.h> #include <linux/uaccess.h> /* * SOL_IP control messages. */ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info = *PKTINFO_SKB_CB(skb); info.ipi_addr.s_addr = ip_hdr(skb)->daddr; put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { int ttl = ip_hdr(skb)->ttl; put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); } static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) { put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos); } static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) { if (IPCB(skb)->opt.optlen == 0) return; put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, ip_hdr(skb) + 1); } static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg, struct sk_buff *skb) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; if (IPCB(skb)->opt.optlen == 0) return; if (ip_options_echo(net, opt, skb)) { msg->msg_flags |= MSG_CTRUNC; return; } ip_options_undo(opt); put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); } static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb) { int val; if (IPCB(skb)->frag_max_size == 0) return; val = IPCB(skb)->frag_max_size; put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val); } static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset) { __wsum csum = skb->csum; if (skb->ip_summed != CHECKSUM_COMPLETE) return; if (offset != 0) { int tend_off = skb_transport_offset(skb) + tlen; csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0)); } put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); } static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) { struct lsm_context ctx; u32 secid; int err; err = security_socket_getpeersec_dgram(NULL, skb, &secid); if (err) return; err = security_secid_to_secctx(secid, &ctx); if (err < 0) return; put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context); security_release_secctx(&ctx); } static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { __be16 _ports[2], *ports; struct sockaddr_in sin; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (!ports) return; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); } void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset) { unsigned long flags = inet_cmsg_flags(inet_sk(sk)); if (!flags) return; /* Ordered by supposed usage frequency */ if (flags & IP_CMSG_PKTINFO) { ip_cmsg_recv_pktinfo(msg, skb); flags &= ~IP_CMSG_PKTINFO; if (!flags) return; } if (flags & IP_CMSG_TTL) { ip_cmsg_recv_ttl(msg, skb); flags &= ~IP_CMSG_TTL; if (!flags) return; } if (flags & IP_CMSG_TOS) { ip_cmsg_recv_tos(msg, skb); flags &= ~IP_CMSG_TOS; if (!flags) return; } if (flags & IP_CMSG_RECVOPTS) { ip_cmsg_recv_opts(msg, skb); flags &= ~IP_CMSG_RECVOPTS; if (!flags) return; } if (flags & IP_CMSG_RETOPTS) { ip_cmsg_recv_retopts(sock_net(sk), msg, skb); flags &= ~IP_CMSG_RETOPTS; if (!flags) return; } if (flags & IP_CMSG_PASSSEC) { ip_cmsg_recv_security(msg, skb); flags &= ~IP_CMSG_PASSSEC; if (!flags) return; } if (flags & IP_CMSG_ORIGDSTADDR) { ip_cmsg_recv_dstaddr(msg, skb); flags &= ~IP_CMSG_ORIGDSTADDR; if (!flags) return; } if (flags & IP_CMSG_CHECKSUM) ip_cmsg_recv_checksum(msg, skb, tlen, offset); if (flags & IP_CMSG_RECVFRAGSIZE) ip_cmsg_recv_fragsize(msg, skb); } EXPORT_SYMBOL(ip_cmsg_recv_offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; struct net *net = sock_net(sk); for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; #if IS_ENABLED(CONFIG_IPV6) if (allow_ipv6 && cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { struct in6_pktinfo *src_info; if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) return -EINVAL; src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) return -EINVAL; if (src_info->ipi6_ifindex) ipc->oif = src_info->ipi6_ifindex; ipc->addr = src_info->ipi6_addr.s6_addr32[3]; continue; } #endif if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, KERNEL_SOCKPTR(CMSG_DATA(cmsg)), err < 40 ? err : 40); if (err) return err; break; case IP_PKTINFO: { struct in_pktinfo *info; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); if (info->ipi_ifindex) ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } case IP_TTL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) return -EINVAL; val = *(int *)CMSG_DATA(cmsg); if (val < 1 || val > 255) return -EINVAL; ipc->ttl = val; break; case IP_TOS: if (cmsg->cmsg_len == CMSG_LEN(sizeof(int))) val = *(int *)CMSG_DATA(cmsg); else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8))) val = *(u8 *)CMSG_DATA(cmsg); else return -EINVAL; if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; ipc->sockc.priority = rt_tos2priority(ipc->tos); break; case IP_PROTOCOL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) return -EINVAL; val = *(int *)CMSG_DATA(cmsg); if (val < 1 || val > 255) return -EINVAL; ipc->protocol = val; break; default: return -EINVAL; } } return 0; } static void ip_ra_destroy_rcu(struct rcu_head *head) { struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); sock_put(ra->saved_sk); kfree(ra); } int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) { struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain __rcu **rap; struct net *net = sock_net(sk); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (on && !new_ra) return -ENOMEM; mutex_lock(&net->ipv4.ra_mutex); for (rap = &net->ipv4.ra_chain; (ra = rcu_dereference_protected(*rap, lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { mutex_unlock(&net->ipv4.ra_mutex); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); mutex_unlock(&net->ipv4.ra_mutex); if (ra->destructor) ra->destructor(sk); /* * Delay sock_put(sk) and kfree(ra) after one rcu grace * period. This guarantee ip_call_ra_chain() dont need * to mess with socket refcounts. */ ra->saved_sk = sk; call_rcu(&ra->rcu, ip_ra_destroy_rcu); return 0; } } if (!new_ra) { mutex_unlock(&net->ipv4.ra_mutex); return -ENOBUFS; } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); mutex_unlock(&net->ipv4.ra_mutex); return 0; } static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr), icmp_hdr(skb)->un.reserved[1] * 4); } } void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct sock_exterr_skb *serr; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; serr->ee.ee_type = icmp_hdr(skb)->type; serr->ee.ee_code = icmp_hdr(skb)->code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) - skb_network_header(skb); serr->port = port; if (skb_pull(skb, payload - skb->data)) { if (inet_test_bit(RECVERR_RFC4884, sk)) ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; } kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { struct sock_exterr_skb *serr; struct iphdr *iph; struct sk_buff *skb; if (!inet_test_bit(RECVERR, sk)) return; skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->daddr = daddr; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = port; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv4 supports cmsg on all imcp errors and some timestamps * * Timestamp code paths do not initialize the fields expected by cmsg: * the PKTINFO fields in skb->cb[]. Fill those in here. */ static bool ipv4_datagram_support_cmsg(const struct sock *sk, struct sk_buff *skb, int ee_origin) { struct in_pktinfo *info; if (ee_origin == SO_EE_ORIGIN_ICMP) return true; if (ee_origin == SO_EE_ORIGIN_LOCAL) return false; /* Support IP_PKTINFO on tstamp packets if requested, to correlate * timestamp with egress dev. Not possible for packets without iif * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). */ info = PKTINFO_SKB_CB(skb); if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) || !info->ipi_ifindex) return false; info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; return true; } /* * Handle MSG_ERRQUEUE */ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv4_datagram_support_addr(serr)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } void __ip_sock_set_tos(struct sock *sk, int val) { u8 old_tos = inet_sk(sk)->tos; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= old_tos & INET_ECN_MASK; } if (old_tos != val) { WRITE_ONCE(inet_sk(sk)->tos, val); WRITE_ONCE(sk->sk_priority, rt_tos2priority(val)); sk_dst_reset(sk); } } void ip_sock_set_tos(struct sock *sk, int val) { sockopt_lock_sock(sk); __ip_sock_set_tos(sk, val); sockopt_release_sock(sk); } EXPORT_SYMBOL(ip_sock_set_tos); void ip_sock_set_freebind(struct sock *sk) { inet_set_bit(FREEBIND, sk); } EXPORT_SYMBOL(ip_sock_set_freebind); void ip_sock_set_recverr(struct sock *sk) { inet_set_bit(RECVERR, sk); } EXPORT_SYMBOL(ip_sock_set_recverr); int ip_sock_set_mtu_discover(struct sock *sk, int val) { if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(inet_sk(sk)->pmtudisc, val); return 0; } EXPORT_SYMBOL(ip_sock_set_mtu_discover); void ip_sock_set_pktinfo(struct sock *sk) { inet_set_bit(PKTINFO, sk); } EXPORT_SYMBOL(ip_sock_set_pktinfo); /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. */ static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_MSFILTER: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: return true; } return false; } static int set_mcast_msfilter(struct sock *sk, int ifindex, int numsrc, int fmode, struct sockaddr_storage *group, struct sockaddr_storage *list) { struct ip_msfilter *msf; struct sockaddr_in *psin; int err, i; msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL); if (!msf) return -ENOBUFS; psin = (struct sockaddr_in *)group; if (psin->sin_family != AF_INET) goto Eaddrnotavail; msf->imsf_multiaddr = psin->sin_addr.s_addr; msf->imsf_interface = 0; msf->imsf_fmode = fmode; msf->imsf_numsrc = numsrc; for (i = 0; i < numsrc; ++i) { psin = (struct sockaddr_in *)&list[i]; if (psin->sin_family != AF_INET) goto Eaddrnotavail; msf->imsf_slist_flex[i] = psin->sin_addr.s_addr; } err = ip_mc_msfilter(sk, msf, ifindex); kfree(msf); return err; Eaddrnotavail: kfree(msf); return -EADDRNOTAVAIL; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen != sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen != sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; struct ip_mreq_source mreqs; struct sockaddr_in *psin; int omode, add, err; err = copy_group_source_from_sockptr(&greqs, optval, optlen); if (err) return err; if (greqs.gsr_group.ss_family != AF_INET || greqs.gsr_source.ss_family != AF_INET) return -EADDRNOTAVAIL; psin = (struct sockaddr_in *)&greqs.gsr_group; mreqs.imr_multiaddr = psin->sin_addr.s_addr; psin = (struct sockaddr_in *)&greqs.gsr_source; mreqs.imr_sourceaddr = psin->sin_addr.s_addr; mreqs.imr_interface = 0; /* use index for mc_source */ if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct ip_mreqn mreq; psin = (struct sockaddr_in *)&greqs.gsr_group; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) return err; greqs.gsr_interface = mreq.imr_ifindex; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface); } static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf = NULL; int err; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffff || gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return err; } static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; unsigned int n; void *p; int err; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ err = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ n = gf32->gf_numsrc; err = -ENOBUFS; if (n >= 0x1ffffff) goto out_free_gsf; err = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, &gf32->gf_group, gf32->gf_slist_flex); out_free_gsf: kfree(p); return err; } static int ip_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ip_mreqn mreq = { }; struct sockaddr_in *psin; struct group_req greq; if (optlen < sizeof(struct group_req)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; psin = (struct sockaddr_in *)&greq.gr_group; if (psin->sin_family != AF_INET) return -EINVAL; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_ifindex = greq.gr_interface; if (optname == MCAST_JOIN_GROUP) return ip_mc_join_group(sk, &mreq); return ip_mc_leave_group(sk, &mreq); } static int compat_ip_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req greq; struct ip_mreqn mreq = { }; struct sockaddr_in *psin; if (optlen < sizeof(struct compat_group_req)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; psin = (struct sockaddr_in *)&greq.gr_group; if (psin->sin_family != AF_INET) return -EINVAL; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_ifindex = greq.gr_interface; if (optname == MCAST_JOIN_GROUP) return ip_mc_join_group(sk, &mreq); return ip_mc_leave_group(sk, &mreq); } DEFINE_STATIC_KEY_FALSE(ip4_min_ttl); int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int val = 0, err, retv; bool needs_rtnl = setsockopt_needs_rtnl(optname); switch (optname) { case IP_PKTINFO: case IP_RECVTTL: case IP_RECVOPTS: case IP_RECVTOS: case IP_RETOPTS: case IP_TOS: case IP_TTL: case IP_HDRINCL: case IP_MTU_DISCOVER: case IP_RECVERR: case IP_ROUTER_ALERT: case IP_FREEBIND: case IP_PASSSEC: case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: case IP_MULTICAST_LOOP: case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: case IP_RECVERR_RFC4884: case IP_LOCAL_PORT_RANGE: if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else if (optlen >= sizeof(char)) { unsigned char ucval; if (copy_from_sockptr(&ucval, optval, sizeof(ucval))) return -EFAULT; val = (int) ucval; } } /* If optlen==0, it is equivalent to val == 0 */ if (optname == IP_ROUTER_ALERT) { retv = ip_ra_control(sk, val ? 1 : 0, NULL); if (retv == 0) inet_assign_bit(RTALERT, sk, val); return retv; } if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IP_PKTINFO: inet_assign_bit(PKTINFO, sk, val); return 0; case IP_RECVTTL: inet_assign_bit(TTL, sk, val); return 0; case IP_RECVTOS: inet_assign_bit(TOS, sk, val); return 0; case IP_RECVOPTS: inet_assign_bit(RECVOPTS, sk, val); return 0; case IP_RETOPTS: inet_assign_bit(RETOPTS, sk, val); return 0; case IP_PASSSEC: inet_assign_bit(PASSSEC, sk, val); return 0; case IP_RECVORIGDSTADDR: inet_assign_bit(ORIGDSTADDR, sk, val); return 0; case IP_RECVFRAGSIZE: if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) return -EINVAL; inet_assign_bit(RECVFRAGSIZE, sk, val); return 0; case IP_RECVERR: inet_assign_bit(RECVERR, sk, val); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IP_RECVERR_RFC4884: if (val < 0 || val > 1) return -EINVAL; inet_assign_bit(RECVERR_RFC4884, sk, val); return 0; case IP_FREEBIND: if (optlen < 1) return -EINVAL; inet_assign_bit(FREEBIND, sk, val); return 0; case IP_HDRINCL: if (sk->sk_type != SOCK_RAW) return -ENOPROTOOPT; inet_assign_bit(HDRINCL, sk, val); return 0; case IP_MULTICAST_LOOP: if (optlen < 1) return -EINVAL; inet_assign_bit(MC_LOOP, sk, val); return 0; case IP_MULTICAST_ALL: if (optlen < 1) return -EINVAL; if (val != 0 && val != 1) return -EINVAL; inet_assign_bit(MC_ALL, sk, val); return 0; case IP_TRANSPARENT: if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (optlen < 1) return -EINVAL; inet_assign_bit(TRANSPARENT, sk, val); return 0; case IP_NODEFRAG: if (sk->sk_type != SOCK_RAW) return -ENOPROTOOPT; inet_assign_bit(NODEFRAG, sk, val); return 0; case IP_BIND_ADDRESS_NO_PORT: inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val); return 0; case IP_TTL: if (optlen < 1) return -EINVAL; if (val != -1 && (val < 1 || val > 255)) return -EINVAL; WRITE_ONCE(inet->uc_ttl, val); return 0; case IP_MINTTL: if (optlen < 1) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip4_min_ttl); WRITE_ONCE(inet->min_ttl, val); return 0; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) return -EINVAL; if (optlen < 1) return -EINVAL; if (val == -1) val = 1; if (val < 0 || val > 255) return -EINVAL; WRITE_ONCE(inet->mc_ttl, val); return 0; case IP_MTU_DISCOVER: return ip_sock_set_mtu_discover(sk, val); case IP_TOS: /* This sets both TOS and Precedence */ ip_sock_set_tos(sk, val); return 0; case IP_LOCAL_PORT_RANGE: { u16 lo = val; u16 hi = val >> 16; if (optlen != sizeof(u32)) return -EINVAL; if (lo != 0 && hi != 0 && lo > hi) return -EINVAL; WRITE_ONCE(inet->local_port_range, val); return 0; } } err = 0; if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: { struct ip_options_rcu *old, *opt = NULL; if (optlen > 40) goto e_inval; err = ip_options_get(sock_net(sk), &opt, optval, optlen); if (err) break; old = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_test_bit(IS_ICSK, sk)) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet->inet_daddr != LOOPBACK4_IPV6)) { #endif if (old) icsk->icsk_ext_hdr_len -= old->opt.optlen; if (opt) icsk->icsk_ext_hdr_len += opt->opt.optlen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if IS_ENABLED(CONFIG_IPV6) } #endif } rcu_assign_pointer(inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); break; } case IP_CHECKSUM: if (val) { if (!(inet_test_bit(CHECKSUM, sk))) { inet_inc_convert_csum(sk); inet_set_bit(CHECKSUM, sk); } } else { if (inet_test_bit(CHECKSUM, sk)) { inet_dec_convert_csum(sk); inet_clear_bit(CHECKSUM, sk); } } break; case IP_UNICAST_IF: { struct net_device *dev = NULL; int ifindex; int midx; if (optlen != sizeof(int)) goto e_inval; ifindex = (__force int)ntohl((__force __be32)val); if (ifindex == 0) { WRITE_ONCE(inet->uc_index, 0); err = 0; break; } dev = dev_get_by_index(sock_net(sk), ifindex); err = -EADDRNOTAVAIL; if (!dev) break; midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; WRITE_ONCE(inet->uc_index, ifindex); err = 0; break; } case IP_MULTICAST_IF: { struct ip_mreqn mreq; struct net_device *dev = NULL; int midx; if (sk->sk_type == SOCK_STREAM) goto e_inval; /* * Check the arguments are allowable */ if (optlen < sizeof(struct in_addr)) goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (optlen >= sizeof(struct ip_mreq)) { if (copy_from_sockptr(&mreq, optval, sizeof(struct ip_mreq))) break; } else if (optlen >= sizeof(struct in_addr)) { if (copy_from_sockptr(&mreq.imr_address, optval, sizeof(struct in_addr))) break; } } if (!mreq.imr_ifindex) { if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { WRITE_ONCE(inet->mc_index, 0); WRITE_ONCE(inet->mc_addr, 0); err = 0; break; } dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); if (dev) mreq.imr_ifindex = dev->ifindex; } else dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex); err = -EADDRNOTAVAIL; if (!dev) break; midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && mreq.imr_ifindex != sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; WRITE_ONCE(inet->mc_index, mreq.imr_ifindex); WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr); err = 0; break; } case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: { struct ip_mreqn mreq; err = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; if (optlen < sizeof(struct ip_mreq)) goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (copy_from_sockptr(&mreq, optval, sizeof(struct ip_mreq))) break; } if (optname == IP_ADD_MEMBERSHIP) err = ip_mc_join_group(sk, &mreq); else err = ip_mc_leave_group(sk, &mreq); break; } case IP_MSFILTER: { struct ip_msfilter *msf; if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) { err = -ENOBUFS; break; } msf = memdup_sockptr(optval, optlen); if (IS_ERR(msf)) { err = PTR_ERR(msf); break; } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { kfree(msf); err = -ENOBUFS; break; } if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { kfree(msf); err = -EINVAL; break; } err = ip_mc_msfilter(sk, msf, 0); kfree(msf); break; } case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; int omode, add; if (optlen != sizeof(struct ip_mreq_source)) goto e_inval; if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) { err = -EFAULT; break; } if (optname == IP_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == IP_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { struct ip_mreqn mreq; mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; add = 1; } else /* IP_DROP_SOURCE_MEMBERSHIP */ { omode = MCAST_INCLUDE; add = 0; } err = ip_mc_source(add, omode, sk, &mreqs, 0); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) err = compat_ip_mcast_join_leave(sk, optname, optval, optlen); else err = ip_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: err = do_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) err = compat_ip_set_mcast_msfilter(sk, optval, optlen); else err = ip_set_mcast_msfilter(sk, optval, optlen); break; case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; e_inval: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return -EINVAL; } /** * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * @drop_dst: if true, drops skb dst * * To support IP_CMSG_PKTINFO option, we store rt_iif and specific * destination in skb->cb[] before dst drop. * This way, receiver doesn't make cache line misses to read rtable. */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); bool prepare = inet_test_bit(PKTINFO, sk) || ipv6_sk_rxinfo(sk); if (prepare && skb_rtable(skb)) { /* skb->cb is overloaded: prior to this point it is IP{6}CB * which has interface index (iif) as the first member of the * underlying inet{6}_skb_parm struct. This code then overlays * PKTINFO_SKB_CB and in_pktinfo also has iif as the first * element so the iif is picked up from the prior IPCB. If iif * is the loopback interface, then return the sending interface * (e.g., process binds socket to eth0 for Tx which is * redirected to loopback in the rtable/dst). */ struct rtable *rt = skb_rtable(skb); bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) pktinfo->ipi_ifindex = inet_iif(skb); else if (l3slave && rt && rt->rt_iif) pktinfo->ipi_ifindex = rt->rt_iif; pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; } if (drop_dst) skb_dst_drop(skb); } int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level != SOL_IP) return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY && !ip_mroute_opt(optname)) err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ip_setsockopt); /* * Get the options. Note for future reference. The GET of IP options gets * the _received_ ones. The set sets the _sent_ ones. */ static bool getsockopt_needs_rtnl(int optname) { switch (optname) { case IP_MSFILTER: case MCAST_MSFILTER: return true; } return false; } static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num, gsf_size; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; num = gsf.gf_numsrc; err = ip_mc_gsfget(sk, &gsf, optval, offsetof(struct group_filter, gf_slist_flex)); if (err) return err; if (gsf.gf_numsrc < num) num = gsf.gf_numsrc; gsf_size = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) return -EFAULT; return 0; } static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; err = ip_mc_gsfget(sk, &gf, optval, offsetof(struct compat_group_filter, gf_slist_flex)); if (err) return err; if (gf.gf_numsrc < num) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf.gf_numsrc))) return -EFAULT; return 0; } int do_ip_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_sock *inet = inet_sk(sk); bool needs_rtnl = getsockopt_needs_rtnl(optname); int val, err = 0; int len; if (level != SOL_IP) return -EOPNOTSUPP; if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; /* Handle options that can be read without locking the socket. */ switch (optname) { case IP_PKTINFO: val = inet_test_bit(PKTINFO, sk); goto copyval; case IP_RECVTTL: val = inet_test_bit(TTL, sk); goto copyval; case IP_RECVTOS: val = inet_test_bit(TOS, sk); goto copyval; case IP_RECVOPTS: val = inet_test_bit(RECVOPTS, sk); goto copyval; case IP_RETOPTS: val = inet_test_bit(RETOPTS, sk); goto copyval; case IP_PASSSEC: val = inet_test_bit(PASSSEC, sk); goto copyval; case IP_RECVORIGDSTADDR: val = inet_test_bit(ORIGDSTADDR, sk); goto copyval; case IP_CHECKSUM: val = inet_test_bit(CHECKSUM, sk); goto copyval; case IP_RECVFRAGSIZE: val = inet_test_bit(RECVFRAGSIZE, sk); goto copyval; case IP_RECVERR: val = inet_test_bit(RECVERR, sk); goto copyval; case IP_RECVERR_RFC4884: val = inet_test_bit(RECVERR_RFC4884, sk); goto copyval; case IP_FREEBIND: val = inet_test_bit(FREEBIND, sk); goto copyval; case IP_HDRINCL: val = inet_test_bit(HDRINCL, sk); goto copyval; case IP_MULTICAST_LOOP: val = inet_test_bit(MC_LOOP, sk); goto copyval; case IP_MULTICAST_ALL: val = inet_test_bit(MC_ALL, sk); goto copyval; case IP_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); goto copyval; case IP_NODEFRAG: val = inet_test_bit(NODEFRAG, sk); goto copyval; case IP_BIND_ADDRESS_NO_PORT: val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); goto copyval; case IP_ROUTER_ALERT: val = inet_test_bit(RTALERT, sk); goto copyval; case IP_TTL: val = READ_ONCE(inet->uc_ttl); if (val < 0) val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl); goto copyval; case IP_MINTTL: val = READ_ONCE(inet->min_ttl); goto copyval; case IP_MULTICAST_TTL: val = READ_ONCE(inet->mc_ttl); goto copyval; case IP_MTU_DISCOVER: val = READ_ONCE(inet->pmtudisc); goto copyval; case IP_TOS: val = READ_ONCE(inet->tos); goto copyval; case IP_OPTIONS: { unsigned char optbuf[sizeof(struct ip_options)+40]; struct ip_options *opt = (struct ip_options *)optbuf; struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); opt->optlen = 0; if (inet_opt) memcpy(optbuf, &inet_opt->opt, sizeof(struct ip_options) + inet_opt->opt.optlen); rcu_read_unlock(); if (opt->optlen == 0) { len = 0; return copy_to_sockptr(optlen, &len, sizeof(int)); } ip_options_undo(opt); len = min_t(unsigned int, len, opt->optlen); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, opt->__data, len)) return -EFAULT; return 0; } case IP_MTU: { struct dst_entry *dst; val = 0; dst = sk_dst_get(sk); if (dst) { val = dst_mtu(dst); dst_release(dst); } if (!val) return -ENOTCONN; goto copyval; } case IP_PKTOPTIONS: { struct msghdr msg; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; if (inet_test_bit(PKTINFO, sk)) { struct in_pktinfo info; info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr); info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr); info.ipi_ifindex = READ_ONCE(inet->mc_index); put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } if (inet_test_bit(TTL, sk)) { int hlim = READ_ONCE(inet->mc_ttl); put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); } if (inet_test_bit(TOS, sk)) { int tos = READ_ONCE(inet->rcv_tos); put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IP_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index)); goto copyval; case IP_MULTICAST_IF: { struct in_addr addr; len = min_t(unsigned int, len, sizeof(struct in_addr)); addr.s_addr = READ_ONCE(inet->mc_addr); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &addr, len)) return -EFAULT; return 0; } case IP_LOCAL_PORT_RANGE: val = READ_ONCE(inet->local_port_range); goto copyval; } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); switch (optname) { case IP_MSFILTER: { struct ip_msfilter msf; if (len < IP_MSFILTER_SIZE(0)) { err = -EINVAL; goto out; } if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) { err = -EFAULT; goto out; } err = ip_mc_msfget(sk, &msf, optval, optlen); goto out; } case MCAST_MSFILTER: if (in_compat_syscall()) err = compat_ip_get_mcast_msfilter(sk, optval, optlen, len); else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; case IP_PROTOCOL: val = inet_sk(sk)->inet_num; break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; } sockopt_release_sock(sk); copyval: if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &ucval, 1)) return -EFAULT; } else { len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; } return 0; out: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; } int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; err = do_ip_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && !ip_mroute_opt(optname)) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); return err; } #endif return err; } EXPORT_SYMBOL(ip_getsockopt);
10 10 10 10 10 10 10 10 10 1 10 10 1 10 3 4 9 10 8 9 11 3 1 2 3 2 2 1 11 11 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 // SPDX-License-Identifier: GPL-2.0-or-later /* * PRNG: Pseudo Random Number Generator * Based on NIST Recommended PRNG From ANSI X9.31 Appendix A.2.4 using * AES 128 cipher * * (C) Neil Horman <nhorman@tuxdriver.com> */ #include <crypto/internal/cipher.h> #include <crypto/internal/rng.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/string.h> #define DEFAULT_PRNG_KEY "0123456789abcdef" #define DEFAULT_PRNG_KSZ 16 #define DEFAULT_BLK_SZ 16 #define DEFAULT_V_SEED "zaybxcwdveuftgsh" /* * Flags for the prng_context flags field */ #define PRNG_FIXED_SIZE 0x1 #define PRNG_NEED_RESET 0x2 /* * Note: DT is our counter value * I is our intermediate value * V is our seed vector * See http://csrc.nist.gov/groups/STM/cavp/documents/rng/931rngext.pdf * for implementation details */ struct prng_context { spinlock_t prng_lock; unsigned char rand_data[DEFAULT_BLK_SZ]; unsigned char last_rand_data[DEFAULT_BLK_SZ]; unsigned char DT[DEFAULT_BLK_SZ]; unsigned char I[DEFAULT_BLK_SZ]; unsigned char V[DEFAULT_BLK_SZ]; u32 rand_data_valid; struct crypto_cipher *tfm; u32 flags; }; static int dbg; static void hexdump(char *note, unsigned char *buf, unsigned int len) { if (dbg) { printk(KERN_CRIT "%s", note); print_hex_dump(KERN_CONT, "", DUMP_PREFIX_OFFSET, 16, 1, buf, len, false); } } #define dbgprint(format, args...) do {\ if (dbg)\ printk(format, ##args);\ } while (0) static void xor_vectors(unsigned char *in1, unsigned char *in2, unsigned char *out, unsigned int size) { int i; for (i = 0; i < size; i++) out[i] = in1[i] ^ in2[i]; } /* * Returns DEFAULT_BLK_SZ bytes of random data per call * returns 0 if generation succeeded, <0 if something went wrong */ static int _get_more_prng_bytes(struct prng_context *ctx, int cont_test) { int i; unsigned char tmp[DEFAULT_BLK_SZ]; unsigned char *output = NULL; dbgprint(KERN_CRIT "Calling _get_more_prng_bytes for context %p\n", ctx); hexdump("Input DT: ", ctx->DT, DEFAULT_BLK_SZ); hexdump("Input I: ", ctx->I, DEFAULT_BLK_SZ); hexdump("Input V: ", ctx->V, DEFAULT_BLK_SZ); /* * This algorithm is a 3 stage state machine */ for (i = 0; i < 3; i++) { switch (i) { case 0: /* * Start by encrypting the counter value * This gives us an intermediate value I */ memcpy(tmp, ctx->DT, DEFAULT_BLK_SZ); output = ctx->I; hexdump("tmp stage 0: ", tmp, DEFAULT_BLK_SZ); break; case 1: /* * Next xor I with our secret vector V * encrypt that result to obtain our * pseudo random data which we output */ xor_vectors(ctx->I, ctx->V, tmp, DEFAULT_BLK_SZ); hexdump("tmp stage 1: ", tmp, DEFAULT_BLK_SZ); output = ctx->rand_data; break; case 2: /* * First check that we didn't produce the same * random data that we did last time around through this */ if (!memcmp(ctx->rand_data, ctx->last_rand_data, DEFAULT_BLK_SZ)) { if (cont_test) { panic("cprng %p Failed repetition check!\n", ctx); } printk(KERN_ERR "ctx %p Failed repetition check!\n", ctx); ctx->flags |= PRNG_NEED_RESET; return -EINVAL; } memcpy(ctx->last_rand_data, ctx->rand_data, DEFAULT_BLK_SZ); /* * Lastly xor the random data with I * and encrypt that to obtain a new secret vector V */ xor_vectors(ctx->rand_data, ctx->I, tmp, DEFAULT_BLK_SZ); output = ctx->V; hexdump("tmp stage 2: ", tmp, DEFAULT_BLK_SZ); break; } /* do the encryption */ crypto_cipher_encrypt_one(ctx->tfm, output, tmp); } /* * Now update our DT value */ for (i = DEFAULT_BLK_SZ - 1; i >= 0; i--) { ctx->DT[i] += 1; if (ctx->DT[i] != 0) break; } dbgprint("Returning new block for context %p\n", ctx); ctx->rand_data_valid = 0; hexdump("Output DT: ", ctx->DT, DEFAULT_BLK_SZ); hexdump("Output I: ", ctx->I, DEFAULT_BLK_SZ); hexdump("Output V: ", ctx->V, DEFAULT_BLK_SZ); hexdump("New Random Data: ", ctx->rand_data, DEFAULT_BLK_SZ); return 0; } /* Our exported functions */ static int get_prng_bytes(char *buf, size_t nbytes, struct prng_context *ctx, int do_cont_test) { unsigned char *ptr = buf; unsigned int byte_count = (unsigned int)nbytes; int err; spin_lock_bh(&ctx->prng_lock); err = -EINVAL; if (ctx->flags & PRNG_NEED_RESET) goto done; /* * If the FIXED_SIZE flag is on, only return whole blocks of * pseudo random data */ err = -EINVAL; if (ctx->flags & PRNG_FIXED_SIZE) { if (nbytes < DEFAULT_BLK_SZ) goto done; byte_count = DEFAULT_BLK_SZ; } /* * Return 0 in case of success as mandated by the kernel * crypto API interface definition. */ err = 0; dbgprint(KERN_CRIT "getting %d random bytes for context %p\n", byte_count, ctx); remainder: if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; } } /* * Copy any data less than an entire block */ if (byte_count < DEFAULT_BLK_SZ) { empty_rbuf: while (ctx->rand_data_valid < DEFAULT_BLK_SZ) { *ptr = ctx->rand_data[ctx->rand_data_valid]; ptr++; byte_count--; ctx->rand_data_valid++; if (byte_count == 0) goto done; } } /* * Now copy whole blocks */ for (; byte_count >= DEFAULT_BLK_SZ; byte_count -= DEFAULT_BLK_SZ) { if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; } } if (ctx->rand_data_valid > 0) goto empty_rbuf; memcpy(ptr, ctx->rand_data, DEFAULT_BLK_SZ); ctx->rand_data_valid += DEFAULT_BLK_SZ; ptr += DEFAULT_BLK_SZ; } /* * Now go back and get any remaining partial block */ if (byte_count) goto remainder; done: spin_unlock_bh(&ctx->prng_lock); dbgprint(KERN_CRIT "returning %d from get_prng_bytes in context %p\n", err, ctx); return err; } static void free_prng_context(struct prng_context *ctx) { crypto_free_cipher(ctx->tfm); } static int reset_prng_context(struct prng_context *ctx, const unsigned char *key, size_t klen, const unsigned char *V, const unsigned char *DT) { int ret; const unsigned char *prng_key; spin_lock_bh(&ctx->prng_lock); ctx->flags |= PRNG_NEED_RESET; prng_key = (key != NULL) ? key : (unsigned char *)DEFAULT_PRNG_KEY; if (!key) klen = DEFAULT_PRNG_KSZ; if (V) memcpy(ctx->V, V, DEFAULT_BLK_SZ); else memcpy(ctx->V, DEFAULT_V_SEED, DEFAULT_BLK_SZ); if (DT) memcpy(ctx->DT, DT, DEFAULT_BLK_SZ); else memset(ctx->DT, 0, DEFAULT_BLK_SZ); memset(ctx->rand_data, 0, DEFAULT_BLK_SZ); memset(ctx->last_rand_data, 0, DEFAULT_BLK_SZ); ctx->rand_data_valid = DEFAULT_BLK_SZ; ret = crypto_cipher_setkey(ctx->tfm, prng_key, klen); if (ret) { dbgprint(KERN_CRIT "PRNG: setkey() failed flags=%x\n", crypto_cipher_get_flags(ctx->tfm)); goto out; } ret = 0; ctx->flags &= ~PRNG_NEED_RESET; out: spin_unlock_bh(&ctx->prng_lock); return ret; } static int cprng_init(struct crypto_tfm *tfm) { struct prng_context *ctx = crypto_tfm_ctx(tfm); spin_lock_init(&ctx->prng_lock); ctx->tfm = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(ctx->tfm)) { dbgprint(KERN_CRIT "Failed to alloc tfm for context %p\n", ctx); return PTR_ERR(ctx->tfm); } if (reset_prng_context(ctx, NULL, DEFAULT_PRNG_KSZ, NULL, NULL) < 0) return -EINVAL; /* * after allocation, we should always force the user to reset * so they don't inadvertently use the insecure default values * without specifying them intentially */ ctx->flags |= PRNG_NEED_RESET; return 0; } static void cprng_exit(struct crypto_tfm *tfm) { free_prng_context(crypto_tfm_ctx(tfm)); } static int cprng_get_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) { struct prng_context *prng = crypto_rng_ctx(tfm); return get_prng_bytes(rdata, dlen, prng, 0); } /* * This is the cprng_registered reset method the seed value is * interpreted as the tuple { V KEY DT} * V and KEY are required during reset, and DT is optional, detected * as being present by testing the length of the seed */ static int cprng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { struct prng_context *prng = crypto_rng_ctx(tfm); const u8 *key = seed + DEFAULT_BLK_SZ; const u8 *dt = NULL; if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ) return -EINVAL; if (slen >= (2 * DEFAULT_BLK_SZ + DEFAULT_PRNG_KSZ)) dt = key + DEFAULT_PRNG_KSZ; reset_prng_context(prng, key, DEFAULT_PRNG_KSZ, seed, dt); if (prng->flags & PRNG_NEED_RESET) return -EINVAL; return 0; } #ifdef CONFIG_CRYPTO_FIPS static int fips_cprng_get_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) { struct prng_context *prng = crypto_rng_ctx(tfm); return get_prng_bytes(rdata, dlen, prng, 1); } static int fips_cprng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { u8 rdata[DEFAULT_BLK_SZ]; const u8 *key = seed + DEFAULT_BLK_SZ; int rc; struct prng_context *prng = crypto_rng_ctx(tfm); if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ) return -EINVAL; /* fips strictly requires seed != key */ if (!memcmp(seed, key, DEFAULT_PRNG_KSZ)) return -EINVAL; rc = cprng_reset(tfm, seed, slen); if (!rc) goto out; /* this primes our continuity test */ rc = get_prng_bytes(rdata, DEFAULT_BLK_SZ, prng, 0); prng->rand_data_valid = DEFAULT_BLK_SZ; out: return rc; } #endif static struct rng_alg rng_algs[] = { { .generate = cprng_get_random, .seed = cprng_reset, .seedsize = DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ, .base = { .cra_name = "stdrng", .cra_driver_name = "ansi_cprng", .cra_priority = 100, .cra_ctxsize = sizeof(struct prng_context), .cra_module = THIS_MODULE, .cra_init = cprng_init, .cra_exit = cprng_exit, } #ifdef CONFIG_CRYPTO_FIPS }, { .generate = fips_cprng_get_random, .seed = fips_cprng_reset, .seedsize = DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ, .base = { .cra_name = "fips(ansi_cprng)", .cra_driver_name = "fips_ansi_cprng", .cra_priority = 300, .cra_ctxsize = sizeof(struct prng_context), .cra_module = THIS_MODULE, .cra_init = cprng_init, .cra_exit = cprng_exit, } #endif } }; /* Module initalization */ static int __init prng_mod_init(void) { return crypto_register_rngs(rng_algs, ARRAY_SIZE(rng_algs)); } static void __exit prng_mod_fini(void) { crypto_unregister_rngs(rng_algs, ARRAY_SIZE(rng_algs)); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Software Pseudo Random Number Generator"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); module_param(dbg, int, 0); MODULE_PARM_DESC(dbg, "Boolean to enable debugging (0/1 == off/on)"); module_init(prng_mod_init); module_exit(prng_mod_fini); MODULE_ALIAS_CRYPTO("stdrng"); MODULE_ALIAS_CRYPTO("ansi_cprng"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 // SPDX-License-Identifier: GPL-2.0+ /* * Edgeport USB Serial Converter driver * * Copyright (C) 2000-2002 Inside Out Networks, All rights reserved. * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com> * * Supports the following devices: * EP/1 EP/2 EP/4 EP/21 EP/22 EP/221 EP/42 EP/421 WATCHPORT * * For questions or problems with this driver, contact Inside Out * Networks technical support, or Peter Berger <pberger@brimson.com>, * or Al Borchers <alborchers@steinerpoint.com>. */ #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/serial.h> #include <linux/swab.h> #include <linux/kfifo.h> #include <linux/ioctl.h> #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include "io_16654.h" #include "io_usbvend.h" #include "io_ti.h" #define DRIVER_AUTHOR "Greg Kroah-Hartman <greg@kroah.com> and David Iacovelli" #define DRIVER_DESC "Edgeport USB Serial Driver" #define EPROM_PAGE_SIZE 64 /* different hardware types */ #define HARDWARE_TYPE_930 0 #define HARDWARE_TYPE_TIUMP 1 /* IOCTL_PRIVATE_TI_GET_MODE Definitions */ #define TI_MODE_CONFIGURING 0 /* Device has not entered start device */ #define TI_MODE_BOOT 1 /* Staying in boot mode */ #define TI_MODE_DOWNLOAD 2 /* Made it to download mode */ #define TI_MODE_TRANSITIONING 3 /* * Currently in boot mode but * transitioning to download mode */ /* read urb state */ #define EDGE_READ_URB_RUNNING 0 #define EDGE_READ_URB_STOPPING 1 #define EDGE_READ_URB_STOPPED 2 /* Product information read from the Edgeport */ struct product_info { int TiMode; /* Current TI Mode */ u8 hardware_type; /* Type of hardware */ } __packed; /* * Edgeport firmware header * * "build_number" has been set to 0 in all three of the images I have * seen, and Digi Tech Support suggests that it is safe to ignore it. * * "length" is the number of bytes of actual data following the header. * * "checksum" is the low order byte resulting from adding the values of * all the data bytes. */ struct edgeport_fw_hdr { u8 major_version; u8 minor_version; __le16 build_number; __le16 length; u8 checksum; } __packed; struct edgeport_port { u16 uart_base; u16 dma_address; u8 shadow_msr; u8 shadow_mcr; u8 shadow_lsr; u8 lsr_mask; u32 ump_read_timeout; /* * Number of milliseconds the UMP will * wait without data before completing * a read short */ int baud_rate; int close_pending; int lsr_event; struct edgeport_serial *edge_serial; struct usb_serial_port *port; u8 bUartMode; /* Port type, 0: RS232, etc. */ spinlock_t ep_lock; int ep_read_urb_state; int ep_write_urb_in_use; }; struct edgeport_serial { struct product_info product_info; u8 TI_I2C_Type; /* Type of I2C in UMP */ u8 TiReadI2C; /* * Set to TRUE if we have read the * I2c in Boot Mode */ struct mutex es_lock; int num_ports_open; struct usb_serial *serial; struct delayed_work heartbeat_work; int fw_version; bool use_heartbeat; }; /* Devices that this driver supports */ static const struct usb_device_id edgeport_1port_id_table[] = { { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_1) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_TI3410_EDGEPORT_1) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_TI3410_EDGEPORT_1I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_PROXIMITY) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_MOTION) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_MOISTURE) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_TEMPERATURE) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_HUMIDITY) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_POWER) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_LIGHT) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_RADIATION) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_DISTANCE) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_ACCELERATION) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_PROX_DIST) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_PLUS_PWR_HP4CD) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_PLUS_PWR_PCI) }, { } }; static const struct usb_device_id edgeport_2port_id_table[] = { { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_2) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_2C) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_2I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_421) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_21) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_42) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_4) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_4I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_22I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_221C) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_22C) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_21C) }, /* The 4, 8 and 16 port devices show up as multiple 2 port devices */ { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_4S) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_8) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_8S) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_416) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_416B) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_E5805A) }, { } }; /* Devices that this driver supports */ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_1) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_TI3410_EDGEPORT_1) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_TI3410_EDGEPORT_1I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_PROXIMITY) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_MOTION) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_MOISTURE) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_TEMPERATURE) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_HUMIDITY) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_POWER) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_LIGHT) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_RADIATION) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_DISTANCE) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_ACCELERATION) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_WP_PROX_DIST) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_PLUS_PWR_HP4CD) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_PLUS_PWR_PCI) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_2) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_2C) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_2I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_421) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_21) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_42) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_4) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_4I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_22I) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_221C) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_22C) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_21C) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_4S) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_8) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_8S) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_416) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_TI_EDGEPORT_416B) }, { USB_DEVICE(USB_VENDOR_ID_ION, ION_DEVICE_ID_E5805A) }, { } }; MODULE_DEVICE_TABLE(usb, id_table_combined); static bool ignore_cpu_rev; static int default_uart_mode; /* RS232 */ static void edge_tty_recv(struct usb_serial_port *port, unsigned char *data, int length); static void stop_read(struct edgeport_port *edge_port); static int restart_read(struct edgeport_port *edge_port); static void edge_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios); static void edge_send(struct usb_serial_port *port, struct tty_struct *tty); static int do_download_mode(struct edgeport_serial *serial, const struct firmware *fw); static int do_boot_mode(struct edgeport_serial *serial, const struct firmware *fw); /* sysfs attributes */ static int edge_create_sysfs_attrs(struct usb_serial_port *port); static int edge_remove_sysfs_attrs(struct usb_serial_port *port); /* * Some release of Edgeport firmware "down3.bin" after version 4.80 * introduced code to automatically disconnect idle devices on some * Edgeport models after periods of inactivity, typically ~60 seconds. * This occurs without regard to whether ports on the device are open * or not. Digi International Tech Support suggested: * * 1. Adding driver "heartbeat" code to reset the firmware timer by * requesting a descriptor record every 15 seconds, which should be * effective with newer firmware versions that require it, and benign * with older versions that do not. In practice 40 seconds seems often * enough. * 2. The heartbeat code is currently required only on Edgeport/416 models. */ #define FW_HEARTBEAT_VERSION_CUTOFF ((4 << 8) + 80) #define FW_HEARTBEAT_SECS 40 /* Timeouts in msecs: firmware downloads take longer */ #define TI_VSEND_TIMEOUT_DEFAULT 1000 #define TI_VSEND_TIMEOUT_FW_DOWNLOAD 10000 static int ti_vread_sync(struct usb_device *dev, u8 request, u16 value, u16 index, void *data, int size) { int status; status = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), request, (USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN), value, index, data, size, 1000); if (status < 0) return status; if (status != size) { dev_dbg(&dev->dev, "%s - wanted to read %d, but only read %d\n", __func__, size, status); return -ECOMM; } return 0; } static int ti_vsend_sync(struct usb_device *dev, u8 request, u16 value, u16 index, void *data, int size, int timeout) { int status; status = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), request, (USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT), value, index, data, size, timeout); if (status < 0) return status; return 0; } static int read_port_cmd(struct usb_serial_port *port, u8 command, u16 value, void *data, int size) { return ti_vread_sync(port->serial->dev, command, value, UMPM_UART1_PORT + port->port_number, data, size); } static int send_port_cmd(struct usb_serial_port *port, u8 command, u16 value, void *data, int size) { return ti_vsend_sync(port->serial->dev, command, value, UMPM_UART1_PORT + port->port_number, data, size, TI_VSEND_TIMEOUT_DEFAULT); } /* clear tx/rx buffers and fifo in TI UMP */ static int purge_port(struct usb_serial_port *port, u16 mask) { int port_number = port->port_number; dev_dbg(&port->dev, "%s - port %d, mask %x\n", __func__, port_number, mask); return send_port_cmd(port, UMPC_PURGE_PORT, mask, NULL, 0); } /** * read_download_mem - Read edgeport memory from TI chip * @dev: usb device pointer * @start_address: Device CPU address at which to read * @length: Length of above data * @address_type: Can read both XDATA and I2C * @buffer: pointer to input data buffer */ static int read_download_mem(struct usb_device *dev, int start_address, int length, u8 address_type, u8 *buffer) { int status = 0; u8 read_length; u16 be_start_address; dev_dbg(&dev->dev, "%s - @ %x for %d\n", __func__, start_address, length); /* * Read in blocks of 64 bytes * (TI firmware can't handle more than 64 byte reads) */ while (length) { if (length > 64) read_length = 64; else read_length = (u8)length; if (read_length > 1) { dev_dbg(&dev->dev, "%s - @ %x for %d\n", __func__, start_address, read_length); } /* * NOTE: Must use swab as wIndex is sent in little-endian * byte order regardless of host byte order. */ be_start_address = swab16((u16)start_address); status = ti_vread_sync(dev, UMPC_MEMORY_READ, (u16)address_type, be_start_address, buffer, read_length); if (status) { dev_dbg(&dev->dev, "%s - ERROR %x\n", __func__, status); return status; } if (read_length > 1) usb_serial_debug_data(&dev->dev, __func__, read_length, buffer); /* Update pointers/length */ start_address += read_length; buffer += read_length; length -= read_length; } return status; } static int read_ram(struct usb_device *dev, int start_address, int length, u8 *buffer) { return read_download_mem(dev, start_address, length, DTK_ADDR_SPACE_XDATA, buffer); } /* Read edgeport memory to a given block */ static int read_boot_mem(struct edgeport_serial *serial, int start_address, int length, u8 *buffer) { int status = 0; int i; for (i = 0; i < length; i++) { status = ti_vread_sync(serial->serial->dev, UMPC_MEMORY_READ, serial->TI_I2C_Type, (u16)(start_address+i), &buffer[i], 0x01); if (status) { dev_dbg(&serial->serial->dev->dev, "%s - ERROR %x\n", __func__, status); return status; } } dev_dbg(&serial->serial->dev->dev, "%s - start_address = %x, length = %d\n", __func__, start_address, length); usb_serial_debug_data(&serial->serial->dev->dev, __func__, length, buffer); serial->TiReadI2C = 1; return status; } /* Write given block to TI EPROM memory */ static int write_boot_mem(struct edgeport_serial *serial, int start_address, int length, u8 *buffer) { int status = 0; int i; u8 *temp; /* Must do a read before write */ if (!serial->TiReadI2C) { temp = kmalloc(1, GFP_KERNEL); if (!temp) return -ENOMEM; status = read_boot_mem(serial, 0, 1, temp); kfree(temp); if (status) return status; } for (i = 0; i < length; ++i) { status = ti_vsend_sync(serial->serial->dev, UMPC_MEMORY_WRITE, buffer[i], (u16)(i + start_address), NULL, 0, TI_VSEND_TIMEOUT_DEFAULT); if (status) return status; } dev_dbg(&serial->serial->dev->dev, "%s - start_sddr = %x, length = %d\n", __func__, start_address, length); usb_serial_debug_data(&serial->serial->dev->dev, __func__, length, buffer); return status; } /* Write edgeport I2C memory to TI chip */ static int write_i2c_mem(struct edgeport_serial *serial, int start_address, int length, u8 address_type, u8 *buffer) { struct device *dev = &serial->serial->dev->dev; int status = 0; int write_length; u16 be_start_address; /* We can only send a maximum of 1 aligned byte page at a time */ /* calculate the number of bytes left in the first page */ write_length = EPROM_PAGE_SIZE - (start_address & (EPROM_PAGE_SIZE - 1)); if (write_length > length) write_length = length; dev_dbg(dev, "%s - BytesInFirstPage Addr = %x, length = %d\n", __func__, start_address, write_length); usb_serial_debug_data(dev, __func__, write_length, buffer); /* * Write first page. * * NOTE: Must use swab as wIndex is sent in little-endian byte order * regardless of host byte order. */ be_start_address = swab16((u16)start_address); status = ti_vsend_sync(serial->serial->dev, UMPC_MEMORY_WRITE, (u16)address_type, be_start_address, buffer, write_length, TI_VSEND_TIMEOUT_DEFAULT); if (status) { dev_dbg(dev, "%s - ERROR %d\n", __func__, status); return status; } length -= write_length; start_address += write_length; buffer += write_length; /* * We should be aligned now -- can write max page size bytes at a * time. */ while (length) { if (length > EPROM_PAGE_SIZE) write_length = EPROM_PAGE_SIZE; else write_length = length; dev_dbg(dev, "%s - Page Write Addr = %x, length = %d\n", __func__, start_address, write_length); usb_serial_debug_data(dev, __func__, write_length, buffer); /* * Write next page. * * NOTE: Must use swab as wIndex is sent in little-endian byte * order regardless of host byte order. */ be_start_address = swab16((u16)start_address); status = ti_vsend_sync(serial->serial->dev, UMPC_MEMORY_WRITE, (u16)address_type, be_start_address, buffer, write_length, TI_VSEND_TIMEOUT_DEFAULT); if (status) { dev_err(dev, "%s - ERROR %d\n", __func__, status); return status; } length -= write_length; start_address += write_length; buffer += write_length; } return status; } /* * Examine the UMP DMA registers and LSR * * Check the MSBit of the X and Y DMA byte count registers. * A zero in this bit indicates that the TX DMA buffers are empty * then check the TX Empty bit in the UART. */ static int tx_active(struct edgeport_port *port) { int status; struct out_endpoint_desc_block *oedb; u8 *lsr; int bytes_left = 0; oedb = kmalloc(sizeof(*oedb), GFP_KERNEL); if (!oedb) return -ENOMEM; /* * Sigh, that's right, just one byte, as not all platforms can * do DMA from stack */ lsr = kmalloc(1, GFP_KERNEL); if (!lsr) { kfree(oedb); return -ENOMEM; } /* Read the DMA Count Registers */ status = read_ram(port->port->serial->dev, port->dma_address, sizeof(*oedb), (void *)oedb); if (status) goto exit_is_tx_active; dev_dbg(&port->port->dev, "%s - XByteCount 0x%X\n", __func__, oedb->XByteCount); /* and the LSR */ status = read_ram(port->port->serial->dev, port->uart_base + UMPMEM_OFFS_UART_LSR, 1, lsr); if (status) goto exit_is_tx_active; dev_dbg(&port->port->dev, "%s - LSR = 0x%X\n", __func__, *lsr); /* If either buffer has data or we are transmitting then return TRUE */ if ((oedb->XByteCount & 0x80) != 0) bytes_left += 64; if ((*lsr & UMP_UART_LSR_TX_MASK) == 0) bytes_left += 1; /* We return Not Active if we get any kind of error */ exit_is_tx_active: dev_dbg(&port->port->dev, "%s - return %d\n", __func__, bytes_left); kfree(lsr); kfree(oedb); return bytes_left; } static int choose_config(struct usb_device *dev) { /* * There may be multiple configurations on this device, in which case * we would need to read and parse all of them to find out which one * we want. However, we just support one config at this point, * configuration # 1, which is Config Descriptor 0. */ dev_dbg(&dev->dev, "%s - Number of Interfaces = %d\n", __func__, dev->config->desc.bNumInterfaces); dev_dbg(&dev->dev, "%s - MAX Power = %d\n", __func__, dev->config->desc.bMaxPower * 2); if (dev->config->desc.bNumInterfaces != 1) { dev_err(&dev->dev, "%s - bNumInterfaces is not 1, ERROR!\n", __func__); return -ENODEV; } return 0; } static int read_rom(struct edgeport_serial *serial, int start_address, int length, u8 *buffer) { int status; if (serial->product_info.TiMode == TI_MODE_DOWNLOAD) { status = read_download_mem(serial->serial->dev, start_address, length, serial->TI_I2C_Type, buffer); } else { status = read_boot_mem(serial, start_address, length, buffer); } return status; } static int write_rom(struct edgeport_serial *serial, int start_address, int length, u8 *buffer) { if (serial->product_info.TiMode == TI_MODE_BOOT) return write_boot_mem(serial, start_address, length, buffer); if (serial->product_info.TiMode == TI_MODE_DOWNLOAD) return write_i2c_mem(serial, start_address, length, serial->TI_I2C_Type, buffer); return -EINVAL; } /* Read a descriptor header from I2C based on type */ static int get_descriptor_addr(struct edgeport_serial *serial, int desc_type, struct ti_i2c_desc *rom_desc) { int start_address; int status; /* Search for requested descriptor in I2C */ start_address = 2; do { status = read_rom(serial, start_address, sizeof(struct ti_i2c_desc), (u8 *)rom_desc); if (status) return 0; if (rom_desc->Type == desc_type) return start_address; start_address = start_address + sizeof(struct ti_i2c_desc) + le16_to_cpu(rom_desc->Size); } while ((start_address < TI_MAX_I2C_SIZE) && rom_desc->Type); return 0; } /* Validate descriptor checksum */ static int valid_csum(struct ti_i2c_desc *rom_desc, u8 *buffer) { u16 i; u8 cs = 0; for (i = 0; i < le16_to_cpu(rom_desc->Size); i++) cs = (u8)(cs + buffer[i]); if (cs != rom_desc->CheckSum) { pr_debug("%s - Mismatch %x - %x", __func__, rom_desc->CheckSum, cs); return -EINVAL; } return 0; } /* Make sure that the I2C image is good */ static int check_i2c_image(struct edgeport_serial *serial) { struct device *dev = &serial->serial->dev->dev; int status = 0; struct ti_i2c_desc *rom_desc; int start_address = 2; u8 *buffer; u16 ttype; rom_desc = kmalloc(sizeof(*rom_desc), GFP_KERNEL); if (!rom_desc) return -ENOMEM; buffer = kmalloc(TI_MAX_I2C_SIZE, GFP_KERNEL); if (!buffer) { kfree(rom_desc); return -ENOMEM; } /* Read the first byte (Signature0) must be 0x52 or 0x10 */ status = read_rom(serial, 0, 1, buffer); if (status) goto out; if (*buffer != UMP5152 && *buffer != UMP3410) { dev_err(dev, "%s - invalid buffer signature\n", __func__); status = -ENODEV; goto out; } do { /* Validate the I2C */ status = read_rom(serial, start_address, sizeof(struct ti_i2c_desc), (u8 *)rom_desc); if (status) break; if ((start_address + sizeof(struct ti_i2c_desc) + le16_to_cpu(rom_desc->Size)) > TI_MAX_I2C_SIZE) { status = -ENODEV; dev_dbg(dev, "%s - structure too big, erroring out.\n", __func__); break; } dev_dbg(dev, "%s Type = 0x%x\n", __func__, rom_desc->Type); /* Skip type 2 record */ ttype = rom_desc->Type & 0x0f; if (ttype != I2C_DESC_TYPE_FIRMWARE_BASIC && ttype != I2C_DESC_TYPE_FIRMWARE_AUTO) { /* Read the descriptor data */ status = read_rom(serial, start_address + sizeof(struct ti_i2c_desc), le16_to_cpu(rom_desc->Size), buffer); if (status) break; status = valid_csum(rom_desc, buffer); if (status) break; } start_address = start_address + sizeof(struct ti_i2c_desc) + le16_to_cpu(rom_desc->Size); } while ((rom_desc->Type != I2C_DESC_TYPE_ION) && (start_address < TI_MAX_I2C_SIZE)); if ((rom_desc->Type != I2C_DESC_TYPE_ION) || (start_address > TI_MAX_I2C_SIZE)) status = -ENODEV; out: kfree(buffer); kfree(rom_desc); return status; } static int get_manuf_info(struct edgeport_serial *serial, u8 *buffer) { int status; int start_address; struct ti_i2c_desc *rom_desc; struct edge_ti_manuf_descriptor *desc; struct device *dev = &serial->serial->dev->dev; rom_desc = kmalloc(sizeof(*rom_desc), GFP_KERNEL); if (!rom_desc) return -ENOMEM; start_address = get_descriptor_addr(serial, I2C_DESC_TYPE_ION, rom_desc); if (!start_address) { dev_dbg(dev, "%s - Edge Descriptor not found in I2C\n", __func__); status = -ENODEV; goto exit; } /* Read the descriptor data */ status = read_rom(serial, start_address+sizeof(struct ti_i2c_desc), le16_to_cpu(rom_desc->Size), buffer); if (status) goto exit; status = valid_csum(rom_desc, buffer); desc = (struct edge_ti_manuf_descriptor *)buffer; dev_dbg(dev, "%s - IonConfig 0x%x\n", __func__, desc->IonConfig); dev_dbg(dev, "%s - Version %d\n", __func__, desc->Version); dev_dbg(dev, "%s - Cpu/Board 0x%x\n", __func__, desc->CpuRev_BoardRev); dev_dbg(dev, "%s - NumPorts %d\n", __func__, desc->NumPorts); dev_dbg(dev, "%s - NumVirtualPorts %d\n", __func__, desc->NumVirtualPorts); dev_dbg(dev, "%s - TotalPorts %d\n", __func__, desc->TotalPorts); exit: kfree(rom_desc); return status; } /* Build firmware header used for firmware update */ static int build_i2c_fw_hdr(u8 *header, const struct firmware *fw) { u8 *buffer; int buffer_size; int i; u8 cs = 0; struct ti_i2c_desc *i2c_header; struct ti_i2c_image_header *img_header; struct ti_i2c_firmware_rec *firmware_rec; struct edgeport_fw_hdr *fw_hdr = (struct edgeport_fw_hdr *)fw->data; /* * In order to update the I2C firmware we must change the type 2 record * to type 0xF2. This will force the UMP to come up in Boot Mode. * Then while in boot mode, the driver will download the latest * firmware (padded to 15.5k) into the UMP ram. And finally when the * device comes back up in download mode the driver will cause the new * firmware to be copied from the UMP Ram to I2C and the firmware will * update the record type from 0xf2 to 0x02. */ /* * Allocate a 15.5k buffer + 2 bytes for version number (Firmware * Record) */ buffer_size = (((1024 * 16) - 512 ) + sizeof(struct ti_i2c_firmware_rec)); buffer = kmalloc(buffer_size, GFP_KERNEL); if (!buffer) return -ENOMEM; /* Set entire image of 0xffs */ memset(buffer, 0xff, buffer_size); /* Copy version number into firmware record */ firmware_rec = (struct ti_i2c_firmware_rec *)buffer; firmware_rec->Ver_Major = fw_hdr->major_version; firmware_rec->Ver_Minor = fw_hdr->minor_version; /* Pointer to fw_down memory image */ img_header = (struct ti_i2c_image_header *)&fw->data[4]; memcpy(buffer + sizeof(struct ti_i2c_firmware_rec), &fw->data[4 + sizeof(struct ti_i2c_image_header)], le16_to_cpu(img_header->Length)); for (i=0; i < buffer_size; i++) { cs = (u8)(cs + buffer[i]); } kfree(buffer); /* Build new header */ i2c_header = (struct ti_i2c_desc *)header; firmware_rec = (struct ti_i2c_firmware_rec*)i2c_header->Data; i2c_header->Type = I2C_DESC_TYPE_FIRMWARE_BLANK; i2c_header->Size = cpu_to_le16(buffer_size); i2c_header->CheckSum = cs; firmware_rec->Ver_Major = fw_hdr->major_version; firmware_rec->Ver_Minor = fw_hdr->minor_version; return 0; } /* Try to figure out what type of I2c we have */ static int i2c_type_bootmode(struct edgeport_serial *serial) { struct device *dev = &serial->serial->dev->dev; int status; u8 *data; data = kmalloc(1, GFP_KERNEL); if (!data) return -ENOMEM; /* Try to read type 2 */ status = ti_vread_sync(serial->serial->dev, UMPC_MEMORY_READ, DTK_ADDR_SPACE_I2C_TYPE_II, 0, data, 0x01); if (status) dev_dbg(dev, "%s - read 2 status error = %d\n", __func__, status); else dev_dbg(dev, "%s - read 2 data = 0x%x\n", __func__, *data); if ((!status) && (*data == UMP5152 || *data == UMP3410)) { dev_dbg(dev, "%s - ROM_TYPE_II\n", __func__); serial->TI_I2C_Type = DTK_ADDR_SPACE_I2C_TYPE_II; goto out; } /* Try to read type 3 */ status = ti_vread_sync(serial->serial->dev, UMPC_MEMORY_READ, DTK_ADDR_SPACE_I2C_TYPE_III, 0, data, 0x01); if (status) dev_dbg(dev, "%s - read 3 status error = %d\n", __func__, status); else dev_dbg(dev, "%s - read 2 data = 0x%x\n", __func__, *data); if ((!status) && (*data == UMP5152 || *data == UMP3410)) { dev_dbg(dev, "%s - ROM_TYPE_III\n", __func__); serial->TI_I2C_Type = DTK_ADDR_SPACE_I2C_TYPE_III; goto out; } dev_dbg(dev, "%s - Unknown\n", __func__); serial->TI_I2C_Type = DTK_ADDR_SPACE_I2C_TYPE_II; status = -ENODEV; out: kfree(data); return status; } static int bulk_xfer(struct usb_serial *serial, void *buffer, int length, int *num_sent) { int status; status = usb_bulk_msg(serial->dev, usb_sndbulkpipe(serial->dev, serial->port[0]->bulk_out_endpointAddress), buffer, length, num_sent, 1000); return status; } /* Download given firmware image to the device (IN BOOT MODE) */ static int download_code(struct edgeport_serial *serial, u8 *image, int image_length) { int status = 0; int pos; int transfer; int done; /* Transfer firmware image */ for (pos = 0; pos < image_length; ) { /* Read the next buffer from file */ transfer = image_length - pos; if (transfer > EDGE_FW_BULK_MAX_PACKET_SIZE) transfer = EDGE_FW_BULK_MAX_PACKET_SIZE; /* Transfer data */ status = bulk_xfer(serial->serial, &image[pos], transfer, &done); if (status) break; /* Advance buffer pointer */ pos += done; } return status; } /* FIXME!!! */ static int config_boot_dev(struct usb_device *dev) { return 0; } static int ti_cpu_rev(struct edge_ti_manuf_descriptor *desc) { return TI_GET_CPU_REVISION(desc->CpuRev_BoardRev); } static int check_fw_sanity(struct edgeport_serial *serial, const struct firmware *fw) { u16 length_total; u8 checksum = 0; int pos; struct device *dev = &serial->serial->interface->dev; struct edgeport_fw_hdr *fw_hdr = (struct edgeport_fw_hdr *)fw->data; if (fw->size < sizeof(struct edgeport_fw_hdr)) { dev_err(dev, "incomplete fw header\n"); return -EINVAL; } length_total = le16_to_cpu(fw_hdr->length) + sizeof(struct edgeport_fw_hdr); if (fw->size != length_total) { dev_err(dev, "bad fw size (expected: %u, got: %zu)\n", length_total, fw->size); return -EINVAL; } for (pos = sizeof(struct edgeport_fw_hdr); pos < fw->size; ++pos) checksum += fw->data[pos]; if (checksum != fw_hdr->checksum) { dev_err(dev, "bad fw checksum (expected: 0x%x, got: 0x%x)\n", fw_hdr->checksum, checksum); return -EINVAL; } return 0; } /* * DownloadTIFirmware - Download run-time operating firmware to the TI5052 * * This routine downloads the main operating code into the TI5052, using the * boot code already burned into E2PROM or ROM. */ static int download_fw(struct edgeport_serial *serial) { struct device *dev = &serial->serial->interface->dev; int status = 0; struct usb_interface_descriptor *interface; const struct firmware *fw; const char *fw_name = "edgeport/down3.bin"; struct edgeport_fw_hdr *fw_hdr; status = request_firmware(&fw, fw_name, dev); if (status) { dev_err(dev, "Failed to load image \"%s\" err %d\n", fw_name, status); return status; } if (check_fw_sanity(serial, fw)) { status = -EINVAL; goto out; } fw_hdr = (struct edgeport_fw_hdr *)fw->data; /* If on-board version is newer, "fw_version" will be updated later. */ serial->fw_version = (fw_hdr->major_version << 8) + fw_hdr->minor_version; /* * This routine is entered by both the BOOT mode and the Download mode * We can determine which code is running by the reading the config * descriptor and if we have only one bulk pipe it is in boot mode */ serial->product_info.hardware_type = HARDWARE_TYPE_TIUMP; /* Default to type 2 i2c */ serial->TI_I2C_Type = DTK_ADDR_SPACE_I2C_TYPE_II; status = choose_config(serial->serial->dev); if (status) goto out; interface = &serial->serial->interface->cur_altsetting->desc; if (!interface) { dev_err(dev, "%s - no interface set, error!\n", __func__); status = -ENODEV; goto out; } /* * Setup initial mode -- the default mode 0 is TI_MODE_CONFIGURING * if we have more than one endpoint we are definitely in download * mode */ if (interface->bNumEndpoints > 1) { serial->product_info.TiMode = TI_MODE_DOWNLOAD; status = do_download_mode(serial, fw); } else { /* Otherwise we will remain in configuring mode */ serial->product_info.TiMode = TI_MODE_CONFIGURING; status = do_boot_mode(serial, fw); } out: release_firmware(fw); return status; } static int do_download_mode(struct edgeport_serial *serial, const struct firmware *fw) { struct device *dev = &serial->serial->interface->dev; int status = 0; int start_address; struct edge_ti_manuf_descriptor *ti_manuf_desc; int download_cur_ver; int download_new_ver; struct edgeport_fw_hdr *fw_hdr = (struct edgeport_fw_hdr *)fw->data; struct ti_i2c_desc *rom_desc; dev_dbg(dev, "%s - RUNNING IN DOWNLOAD MODE\n", __func__); status = check_i2c_image(serial); if (status) { dev_dbg(dev, "%s - DOWNLOAD MODE -- BAD I2C\n", __func__); return status; } /* * Validate Hardware version number * Read Manufacturing Descriptor from TI Based Edgeport */ ti_manuf_desc = kmalloc(sizeof(*ti_manuf_desc), GFP_KERNEL); if (!ti_manuf_desc) return -ENOMEM; status = get_manuf_info(serial, (u8 *)ti_manuf_desc); if (status) { kfree(ti_manuf_desc); return status; } /* Check version number of ION descriptor */ if (!ignore_cpu_rev && ti_cpu_rev(ti_manuf_desc) < 2) { dev_dbg(dev, "%s - Wrong CPU Rev %d (Must be 2)\n", __func__, ti_cpu_rev(ti_manuf_desc)); kfree(ti_manuf_desc); return -EINVAL; } rom_desc = kmalloc(sizeof(*rom_desc), GFP_KERNEL); if (!rom_desc) { kfree(ti_manuf_desc); return -ENOMEM; } /* Search for type 2 record (firmware record) */ start_address = get_descriptor_addr(serial, I2C_DESC_TYPE_FIRMWARE_BASIC, rom_desc); if (start_address != 0) { struct ti_i2c_firmware_rec *firmware_version; u8 *record; dev_dbg(dev, "%s - Found Type FIRMWARE (Type 2) record\n", __func__); firmware_version = kmalloc(sizeof(*firmware_version), GFP_KERNEL); if (!firmware_version) { kfree(rom_desc); kfree(ti_manuf_desc); return -ENOMEM; } /* * Validate version number * Read the descriptor data */ status = read_rom(serial, start_address + sizeof(struct ti_i2c_desc), sizeof(struct ti_i2c_firmware_rec), (u8 *)firmware_version); if (status) { kfree(firmware_version); kfree(rom_desc); kfree(ti_manuf_desc); return status; } /* * Check version number of download with current * version in I2c */ download_cur_ver = (firmware_version->Ver_Major << 8) + (firmware_version->Ver_Minor); download_new_ver = (fw_hdr->major_version << 8) + (fw_hdr->minor_version); dev_dbg(dev, "%s - >> FW Versions Device %d.%d Driver %d.%d\n", __func__, firmware_version->Ver_Major, firmware_version->Ver_Minor, fw_hdr->major_version, fw_hdr->minor_version); /* * Check if we have an old version in the I2C and * update if necessary */ if (download_cur_ver < download_new_ver) { dev_dbg(dev, "%s - Update I2C dld from %d.%d to %d.%d\n", __func__, firmware_version->Ver_Major, firmware_version->Ver_Minor, fw_hdr->major_version, fw_hdr->minor_version); record = kmalloc(1, GFP_KERNEL); if (!record) { kfree(firmware_version); kfree(rom_desc); kfree(ti_manuf_desc); return -ENOMEM; } /* * In order to update the I2C firmware we must * change the type 2 record to type 0xF2. This * will force the UMP to come up in Boot Mode. * Then while in boot mode, the driver will * download the latest firmware (padded to * 15.5k) into the UMP ram. Finally when the * device comes back up in download mode the * driver will cause the new firmware to be * copied from the UMP Ram to I2C and the * firmware will update the record type from * 0xf2 to 0x02. */ *record = I2C_DESC_TYPE_FIRMWARE_BLANK; /* * Change the I2C Firmware record type to * 0xf2 to trigger an update */ status = write_rom(serial, start_address, sizeof(*record), record); if (status) { kfree(record); kfree(firmware_version); kfree(rom_desc); kfree(ti_manuf_desc); return status; } /* * verify the write -- must do this in order * for write to complete before we do the * hardware reset */ status = read_rom(serial, start_address, sizeof(*record), record); if (status) { kfree(record); kfree(firmware_version); kfree(rom_desc); kfree(ti_manuf_desc); return status; } if (*record != I2C_DESC_TYPE_FIRMWARE_BLANK) { dev_err(dev, "%s - error resetting device\n", __func__); kfree(record); kfree(firmware_version); kfree(rom_desc); kfree(ti_manuf_desc); return -ENODEV; } dev_dbg(dev, "%s - HARDWARE RESET\n", __func__); /* Reset UMP -- Back to BOOT MODE */ status = ti_vsend_sync(serial->serial->dev, UMPC_HARDWARE_RESET, 0, 0, NULL, 0, TI_VSEND_TIMEOUT_DEFAULT); dev_dbg(dev, "%s - HARDWARE RESET return %d\n", __func__, status); /* return an error on purpose. */ kfree(record); kfree(firmware_version); kfree(rom_desc); kfree(ti_manuf_desc); return -ENODEV; } /* Same or newer fw version is already loaded */ serial->fw_version = download_cur_ver; kfree(firmware_version); } /* Search for type 0xF2 record (firmware blank record) */ else { start_address = get_descriptor_addr(serial, I2C_DESC_TYPE_FIRMWARE_BLANK, rom_desc); if (start_address != 0) { #define HEADER_SIZE (sizeof(struct ti_i2c_desc) + \ sizeof(struct ti_i2c_firmware_rec)) u8 *header; u8 *vheader; header = kmalloc(HEADER_SIZE, GFP_KERNEL); if (!header) { kfree(rom_desc); kfree(ti_manuf_desc); return -ENOMEM; } vheader = kmalloc(HEADER_SIZE, GFP_KERNEL); if (!vheader) { kfree(header); kfree(rom_desc); kfree(ti_manuf_desc); return -ENOMEM; } dev_dbg(dev, "%s - Found Type BLANK FIRMWARE (Type F2) record\n", __func__); /* * In order to update the I2C firmware we must change * the type 2 record to type 0xF2. This will force the * UMP to come up in Boot Mode. Then while in boot * mode, the driver will download the latest firmware * (padded to 15.5k) into the UMP ram. Finally when the * device comes back up in download mode the driver * will cause the new firmware to be copied from the * UMP Ram to I2C and the firmware will update the * record type from 0xf2 to 0x02. */ status = build_i2c_fw_hdr(header, fw); if (status) { kfree(vheader); kfree(header); kfree(rom_desc); kfree(ti_manuf_desc); return -EINVAL; } /* * Update I2C with type 0xf2 record with correct * size and checksum */ status = write_rom(serial, start_address, HEADER_SIZE, header); if (status) { kfree(vheader); kfree(header); kfree(rom_desc); kfree(ti_manuf_desc); return -EINVAL; } /* * verify the write -- must do this in order for * write to complete before we do the hardware reset */ status = read_rom(serial, start_address, HEADER_SIZE, vheader); if (status) { dev_dbg(dev, "%s - can't read header back\n", __func__); kfree(vheader); kfree(header); kfree(rom_desc); kfree(ti_manuf_desc); return status; } if (memcmp(vheader, header, HEADER_SIZE)) { dev_dbg(dev, "%s - write download record failed\n", __func__); kfree(vheader); kfree(header); kfree(rom_desc); kfree(ti_manuf_desc); return -EINVAL; } kfree(vheader); kfree(header); dev_dbg(dev, "%s - Start firmware update\n", __func__); /* Tell firmware to copy download image into I2C */ status = ti_vsend_sync(serial->serial->dev, UMPC_COPY_DNLD_TO_I2C, 0, 0, NULL, 0, TI_VSEND_TIMEOUT_FW_DOWNLOAD); dev_dbg(dev, "%s - Update complete 0x%x\n", __func__, status); if (status) { dev_err(dev, "%s - UMPC_COPY_DNLD_TO_I2C failed\n", __func__); kfree(rom_desc); kfree(ti_manuf_desc); return status; } } } /* The device is running the download code */ kfree(rom_desc); kfree(ti_manuf_desc); return 0; } static int do_boot_mode(struct edgeport_serial *serial, const struct firmware *fw) { struct device *dev = &serial->serial->interface->dev; int status = 0; struct edge_ti_manuf_descriptor *ti_manuf_desc; struct edgeport_fw_hdr *fw_hdr = (struct edgeport_fw_hdr *)fw->data; dev_dbg(dev, "%s - RUNNING IN BOOT MODE\n", __func__); /* Configure the TI device so we can use the BULK pipes for download */ status = config_boot_dev(serial->serial->dev); if (status) return status; if (le16_to_cpu(serial->serial->dev->descriptor.idVendor) != USB_VENDOR_ID_ION) { dev_dbg(dev, "%s - VID = 0x%x\n", __func__, le16_to_cpu(serial->serial->dev->descriptor.idVendor)); serial->TI_I2C_Type = DTK_ADDR_SPACE_I2C_TYPE_II; goto stayinbootmode; } /* * We have an ION device (I2c Must be programmed) * Determine I2C image type */ if (i2c_type_bootmode(serial)) goto stayinbootmode; /* Check for ION Vendor ID and that the I2C is valid */ if (!check_i2c_image(serial)) { struct ti_i2c_image_header *header; int i; u8 cs = 0; u8 *buffer; int buffer_size; /* * Validate Hardware version number * Read Manufacturing Descriptor from TI Based Edgeport */ ti_manuf_desc = kmalloc(sizeof(*ti_manuf_desc), GFP_KERNEL); if (!ti_manuf_desc) return -ENOMEM; status = get_manuf_info(serial, (u8 *)ti_manuf_desc); if (status) { kfree(ti_manuf_desc); goto stayinbootmode; } /* Check for version 2 */ if (!ignore_cpu_rev && ti_cpu_rev(ti_manuf_desc) < 2) { dev_dbg(dev, "%s - Wrong CPU Rev %d (Must be 2)\n", __func__, ti_cpu_rev(ti_manuf_desc)); kfree(ti_manuf_desc); goto stayinbootmode; } kfree(ti_manuf_desc); /* * In order to update the I2C firmware we must change the type * 2 record to type 0xF2. This will force the UMP to come up * in Boot Mode. Then while in boot mode, the driver will * download the latest firmware (padded to 15.5k) into the * UMP ram. Finally when the device comes back up in download * mode the driver will cause the new firmware to be copied * from the UMP Ram to I2C and the firmware will update the * record type from 0xf2 to 0x02. * * Do we really have to copy the whole firmware image, * or could we do this in place! */ /* Allocate a 15.5k buffer + 3 byte header */ buffer_size = (((1024 * 16) - 512) + sizeof(struct ti_i2c_image_header)); buffer = kmalloc(buffer_size, GFP_KERNEL); if (!buffer) return -ENOMEM; /* Initialize the buffer to 0xff (pad the buffer) */ memset(buffer, 0xff, buffer_size); memcpy(buffer, &fw->data[4], fw->size - 4); for (i = sizeof(struct ti_i2c_image_header); i < buffer_size; i++) { cs = (u8)(cs + buffer[i]); } header = (struct ti_i2c_image_header *)buffer; /* update length and checksum after padding */ header->Length = cpu_to_le16((u16)(buffer_size - sizeof(struct ti_i2c_image_header))); header->CheckSum = cs; /* Download the operational code */ dev_dbg(dev, "%s - Downloading operational code image version %d.%d (TI UMP)\n", __func__, fw_hdr->major_version, fw_hdr->minor_version); status = download_code(serial, buffer, buffer_size); kfree(buffer); if (status) { dev_dbg(dev, "%s - Error downloading operational code image\n", __func__); return status; } /* Device will reboot */ serial->product_info.TiMode = TI_MODE_TRANSITIONING; dev_dbg(dev, "%s - Download successful -- Device rebooting...\n", __func__); return 1; } stayinbootmode: /* Eprom is invalid or blank stay in boot mode */ dev_dbg(dev, "%s - STAYING IN BOOT MODE\n", __func__); serial->product_info.TiMode = TI_MODE_BOOT; return 1; } static int ti_do_config(struct edgeport_port *port, int feature, int on) { on = !!on; /* 1 or 0 not bitmask */ return send_port_cmd(port->port, feature, on, NULL, 0); } static int restore_mcr(struct edgeport_port *port, u8 mcr) { int status = 0; dev_dbg(&port->port->dev, "%s - %x\n", __func__, mcr); status = ti_do_config(port, UMPC_SET_CLR_DTR, mcr & MCR_DTR); if (status) return status; status = ti_do_config(port, UMPC_SET_CLR_RTS, mcr & MCR_RTS); if (status) return status; return ti_do_config(port, UMPC_SET_CLR_LOOPBACK, mcr & MCR_LOOPBACK); } /* Convert TI LSR to standard UART flags */ static u8 map_line_status(u8 ti_lsr) { u8 lsr = 0; #define MAP_FLAG(flagUmp, flagUart) \ if (ti_lsr & flagUmp) \ lsr |= flagUart; MAP_FLAG(UMP_UART_LSR_OV_MASK, LSR_OVER_ERR) /* overrun */ MAP_FLAG(UMP_UART_LSR_PE_MASK, LSR_PAR_ERR) /* parity error */ MAP_FLAG(UMP_UART_LSR_FE_MASK, LSR_FRM_ERR) /* framing error */ MAP_FLAG(UMP_UART_LSR_BR_MASK, LSR_BREAK) /* break detected */ MAP_FLAG(UMP_UART_LSR_RX_MASK, LSR_RX_AVAIL) /* rx data available */ MAP_FLAG(UMP_UART_LSR_TX_MASK, LSR_TX_EMPTY) /* tx hold reg empty */ #undef MAP_FLAG return lsr; } static void handle_new_msr(struct edgeport_port *edge_port, u8 msr) { struct async_icount *icount; struct tty_struct *tty; dev_dbg(&edge_port->port->dev, "%s - %02x\n", __func__, msr); if (msr & (EDGEPORT_MSR_DELTA_CTS | EDGEPORT_MSR_DELTA_DSR | EDGEPORT_MSR_DELTA_RI | EDGEPORT_MSR_DELTA_CD)) { icount = &edge_port->port->icount; /* update input line counters */ if (msr & EDGEPORT_MSR_DELTA_CTS) icount->cts++; if (msr & EDGEPORT_MSR_DELTA_DSR) icount->dsr++; if (msr & EDGEPORT_MSR_DELTA_CD) icount->dcd++; if (msr & EDGEPORT_MSR_DELTA_RI) icount->rng++; wake_up_interruptible(&edge_port->port->port.delta_msr_wait); } /* Save the new modem status */ edge_port->shadow_msr = msr & 0xf0; tty = tty_port_tty_get(&edge_port->port->port); /* handle CTS flow control */ if (tty && C_CRTSCTS(tty)) { if (msr & EDGEPORT_MSR_CTS) tty_wakeup(tty); } tty_kref_put(tty); } static void handle_new_lsr(struct edgeport_port *edge_port, int lsr_data, u8 lsr, u8 data) { struct async_icount *icount; u8 new_lsr = (u8)(lsr & (u8)(LSR_OVER_ERR | LSR_PAR_ERR | LSR_FRM_ERR | LSR_BREAK)); dev_dbg(&edge_port->port->dev, "%s - %02x\n", __func__, new_lsr); edge_port->shadow_lsr = lsr; if (new_lsr & LSR_BREAK) /* * Parity and Framing errors only count if they * occur exclusive of a break being received. */ new_lsr &= (u8)(LSR_OVER_ERR | LSR_BREAK); /* Place LSR data byte into Rx buffer */ if (lsr_data) edge_tty_recv(edge_port->port, &data, 1); /* update input line counters */ icount = &edge_port->port->icount; if (new_lsr & LSR_BREAK) icount->brk++; if (new_lsr & LSR_OVER_ERR) icount->overrun++; if (new_lsr & LSR_PAR_ERR) icount->parity++; if (new_lsr & LSR_FRM_ERR) icount->frame++; } static void edge_interrupt_callback(struct urb *urb) { struct edgeport_serial *edge_serial = urb->context; struct usb_serial_port *port; struct edgeport_port *edge_port; struct device *dev; unsigned char *data = urb->transfer_buffer; int length = urb->actual_length; int port_number; int function; int retval; u8 lsr; u8 msr; int status = urb->status; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&urb->dev->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_err(&urb->dev->dev, "%s - nonzero urb status received: " "%d\n", __func__, status); goto exit; } if (!length) { dev_dbg(&urb->dev->dev, "%s - no data in urb\n", __func__); goto exit; } dev = &edge_serial->serial->dev->dev; usb_serial_debug_data(dev, __func__, length, data); if (length != 2) { dev_dbg(dev, "%s - expecting packet of size 2, got %d\n", __func__, length); goto exit; } port_number = TIUMP_GET_PORT_FROM_CODE(data[0]); function = TIUMP_GET_FUNC_FROM_CODE(data[0]); dev_dbg(dev, "%s - port_number %d, function %d, info 0x%x\n", __func__, port_number, function, data[1]); if (port_number >= edge_serial->serial->num_ports) { dev_err(dev, "bad port number %d\n", port_number); goto exit; } port = edge_serial->serial->port[port_number]; edge_port = usb_get_serial_port_data(port); if (!edge_port) { dev_dbg(dev, "%s - edge_port not found\n", __func__); return; } switch (function) { case TIUMP_INTERRUPT_CODE_LSR: lsr = map_line_status(data[1]); if (lsr & UMP_UART_LSR_DATA_MASK) { /* * Save the LSR event for bulk read completion routine */ dev_dbg(dev, "%s - LSR Event Port %u LSR Status = %02x\n", __func__, port_number, lsr); edge_port->lsr_event = 1; edge_port->lsr_mask = lsr; } else { dev_dbg(dev, "%s - ===== Port %d LSR Status = %02x ======\n", __func__, port_number, lsr); handle_new_lsr(edge_port, 0, lsr, 0); } break; case TIUMP_INTERRUPT_CODE_MSR: /* MSR */ /* Copy MSR from UMP */ msr = data[1]; dev_dbg(dev, "%s - ===== Port %u MSR Status = %02x ======\n", __func__, port_number, msr); handle_new_msr(edge_port, msr); break; default: dev_err(&urb->dev->dev, "%s - Unknown Interrupt code from UMP %x\n", __func__, data[1]); break; } exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&urb->dev->dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } static void edge_bulk_in_callback(struct urb *urb) { struct edgeport_port *edge_port = urb->context; struct device *dev = &edge_port->port->dev; unsigned char *data = urb->transfer_buffer; unsigned long flags; int retval = 0; int port_number; int status = urb->status; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&urb->dev->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_err(&urb->dev->dev, "%s - nonzero read bulk status received: %d\n", __func__, status); } if (status == -EPIPE) goto exit; if (status) { dev_err(&urb->dev->dev, "%s - stopping read!\n", __func__); return; } port_number = edge_port->port->port_number; if (urb->actual_length > 0 && edge_port->lsr_event) { edge_port->lsr_event = 0; dev_dbg(dev, "%s ===== Port %u LSR Status = %02x, Data = %02x ======\n", __func__, port_number, edge_port->lsr_mask, *data); handle_new_lsr(edge_port, 1, edge_port->lsr_mask, *data); /* Adjust buffer length/pointer */ --urb->actual_length; ++data; } if (urb->actual_length) { usb_serial_debug_data(dev, __func__, urb->actual_length, data); if (edge_port->close_pending) dev_dbg(dev, "%s - close pending, dropping data on the floor\n", __func__); else edge_tty_recv(edge_port->port, data, urb->actual_length); edge_port->port->icount.rx += urb->actual_length; } exit: /* continue read unless stopped */ spin_lock_irqsave(&edge_port->ep_lock, flags); if (edge_port->ep_read_urb_state == EDGE_READ_URB_RUNNING) retval = usb_submit_urb(urb, GFP_ATOMIC); else if (edge_port->ep_read_urb_state == EDGE_READ_URB_STOPPING) edge_port->ep_read_urb_state = EDGE_READ_URB_STOPPED; spin_unlock_irqrestore(&edge_port->ep_lock, flags); if (retval) dev_err(dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } static void edge_tty_recv(struct usb_serial_port *port, unsigned char *data, int length) { int queued; queued = tty_insert_flip_string(&port->port, data, length); if (queued < length) dev_err(&port->dev, "%s - dropping data, %d bytes lost\n", __func__, length - queued); tty_flip_buffer_push(&port->port); } static void edge_bulk_out_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct edgeport_port *edge_port = usb_get_serial_port_data(port); int status = urb->status; struct tty_struct *tty; edge_port->ep_write_urb_in_use = 0; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&urb->dev->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_err_console(port, "%s - nonzero write bulk status " "received: %d\n", __func__, status); } /* send any buffered data */ tty = tty_port_tty_get(&port->port); edge_send(port, tty); tty_kref_put(tty); } static int edge_open(struct tty_struct *tty, struct usb_serial_port *port) { struct edgeport_port *edge_port = usb_get_serial_port_data(port); struct edgeport_serial *edge_serial; struct usb_device *dev; struct urb *urb; int status; u16 open_settings; u8 transaction_timeout; if (edge_port == NULL) return -ENODEV; dev = port->serial->dev; /* turn off loopback */ status = ti_do_config(edge_port, UMPC_SET_CLR_LOOPBACK, 0); if (status) { dev_err(&port->dev, "%s - cannot send clear loopback command, %d\n", __func__, status); return status; } /* set up the port settings */ if (tty) edge_set_termios(tty, port, &tty->termios); /* open up the port */ /* milliseconds to timeout for DMA transfer */ transaction_timeout = 2; edge_port->ump_read_timeout = max(20, ((transaction_timeout * 3) / 2)); /* milliseconds to timeout for DMA transfer */ open_settings = (u8)(UMP_DMA_MODE_CONTINOUS | UMP_PIPE_TRANS_TIMEOUT_ENA | (transaction_timeout << 2)); dev_dbg(&port->dev, "%s - Sending UMPC_OPEN_PORT\n", __func__); /* Tell TI to open and start the port */ status = send_port_cmd(port, UMPC_OPEN_PORT, open_settings, NULL, 0); if (status) { dev_err(&port->dev, "%s - cannot send open command, %d\n", __func__, status); return status; } /* Start the DMA? */ status = send_port_cmd(port, UMPC_START_PORT, 0, NULL, 0); if (status) { dev_err(&port->dev, "%s - cannot send start DMA command, %d\n", __func__, status); return status; } /* Clear TX and RX buffers in UMP */ status = purge_port(port, UMP_PORT_DIR_OUT | UMP_PORT_DIR_IN); if (status) { dev_err(&port->dev, "%s - cannot send clear buffers command, %d\n", __func__, status); return status; } /* Read Initial MSR */ status = read_port_cmd(port, UMPC_READ_MSR, 0, &edge_port->shadow_msr, 1); if (status) { dev_err(&port->dev, "%s - cannot send read MSR command, %d\n", __func__, status); return status; } dev_dbg(&port->dev, "ShadowMSR 0x%X\n", edge_port->shadow_msr); /* Set Initial MCR */ edge_port->shadow_mcr = MCR_RTS | MCR_DTR; dev_dbg(&port->dev, "ShadowMCR 0x%X\n", edge_port->shadow_mcr); edge_serial = edge_port->edge_serial; if (mutex_lock_interruptible(&edge_serial->es_lock)) return -ERESTARTSYS; if (edge_serial->num_ports_open == 0) { /* we are the first port to open, post the interrupt urb */ urb = edge_serial->serial->port[0]->interrupt_in_urb; urb->context = edge_serial; status = usb_submit_urb(urb, GFP_KERNEL); if (status) { dev_err(&port->dev, "%s - usb_submit_urb failed with value %d\n", __func__, status); goto release_es_lock; } } /* * reset the data toggle on the bulk endpoints to work around bug in * host controllers where things get out of sync some times */ usb_clear_halt(dev, port->write_urb->pipe); usb_clear_halt(dev, port->read_urb->pipe); /* start up our bulk read urb */ urb = port->read_urb; edge_port->ep_read_urb_state = EDGE_READ_URB_RUNNING; urb->context = edge_port; status = usb_submit_urb(urb, GFP_KERNEL); if (status) { dev_err(&port->dev, "%s - read bulk usb_submit_urb failed with value %d\n", __func__, status); goto unlink_int_urb; } ++edge_serial->num_ports_open; goto release_es_lock; unlink_int_urb: if (edge_port->edge_serial->num_ports_open == 0) usb_kill_urb(port->serial->port[0]->interrupt_in_urb); release_es_lock: mutex_unlock(&edge_serial->es_lock); return status; } static void edge_close(struct usb_serial_port *port) { struct edgeport_serial *edge_serial; struct edgeport_port *edge_port; unsigned long flags; edge_serial = usb_get_serial_data(port->serial); edge_port = usb_get_serial_port_data(port); if (edge_serial == NULL || edge_port == NULL) return; /* * The bulkreadcompletion routine will check * this flag and dump add read data */ edge_port->close_pending = 1; usb_kill_urb(port->read_urb); usb_kill_urb(port->write_urb); edge_port->ep_write_urb_in_use = 0; spin_lock_irqsave(&edge_port->ep_lock, flags); kfifo_reset_out(&port->write_fifo); spin_unlock_irqrestore(&edge_port->ep_lock, flags); dev_dbg(&port->dev, "%s - send umpc_close_port\n", __func__); send_port_cmd(port, UMPC_CLOSE_PORT, 0, NULL, 0); mutex_lock(&edge_serial->es_lock); --edge_port->edge_serial->num_ports_open; if (edge_port->edge_serial->num_ports_open <= 0) { /* last port is now closed, let's shut down our interrupt urb */ usb_kill_urb(port->serial->port[0]->interrupt_in_urb); edge_port->edge_serial->num_ports_open = 0; } mutex_unlock(&edge_serial->es_lock); edge_port->close_pending = 0; } static int edge_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *data, int count) { struct edgeport_port *edge_port = usb_get_serial_port_data(port); if (count == 0) { dev_dbg(&port->dev, "%s - write request of 0 bytes\n", __func__); return 0; } if (edge_port == NULL) return -ENODEV; if (edge_port->close_pending == 1) return -ENODEV; count = kfifo_in_locked(&port->write_fifo, data, count, &edge_port->ep_lock); edge_send(port, tty); return count; } static void edge_send(struct usb_serial_port *port, struct tty_struct *tty) { int count, result; struct edgeport_port *edge_port = usb_get_serial_port_data(port); unsigned long flags; spin_lock_irqsave(&edge_port->ep_lock, flags); if (edge_port->ep_write_urb_in_use) { spin_unlock_irqrestore(&edge_port->ep_lock, flags); return; } count = kfifo_out(&port->write_fifo, port->write_urb->transfer_buffer, port->bulk_out_size); if (count == 0) { spin_unlock_irqrestore(&edge_port->ep_lock, flags); return; } edge_port->ep_write_urb_in_use = 1; spin_unlock_irqrestore(&edge_port->ep_lock, flags); usb_serial_debug_data(&port->dev, __func__, count, port->write_urb->transfer_buffer); /* set up our urb */ port->write_urb->transfer_buffer_length = count; /* send the data out the bulk port */ result = usb_submit_urb(port->write_urb, GFP_ATOMIC); if (result) { dev_err_console(port, "%s - failed submitting write urb, error %d\n", __func__, result); edge_port->ep_write_urb_in_use = 0; /* TODO: reschedule edge_send */ } else edge_port->port->icount.tx += count; /* * wakeup any process waiting for writes to complete * there is now more room in the buffer for new writes */ if (tty) tty_wakeup(tty); } static unsigned int edge_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct edgeport_port *edge_port = usb_get_serial_port_data(port); unsigned int room; unsigned long flags; if (edge_port == NULL) return 0; if (edge_port->close_pending == 1) return 0; spin_lock_irqsave(&edge_port->ep_lock, flags); room = kfifo_avail(&port->write_fifo); spin_unlock_irqrestore(&edge_port->ep_lock, flags); dev_dbg(&port->dev, "%s - returns %u\n", __func__, room); return room; } static unsigned int edge_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct edgeport_port *edge_port = usb_get_serial_port_data(port); unsigned int chars; unsigned long flags; if (edge_port == NULL) return 0; spin_lock_irqsave(&edge_port->ep_lock, flags); chars = kfifo_len(&port->write_fifo); spin_unlock_irqrestore(&edge_port->ep_lock, flags); dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars); return chars; } static bool edge_tx_empty(struct usb_serial_port *port) { struct edgeport_port *edge_port = usb_get_serial_port_data(port); int ret; ret = tx_active(edge_port); if (ret > 0) return false; return true; } static void edge_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct edgeport_port *edge_port = usb_get_serial_port_data(port); int status; if (edge_port == NULL) return; /* if we are implementing XON/XOFF, send the stop character */ if (I_IXOFF(tty)) { unsigned char stop_char = STOP_CHAR(tty); status = edge_write(tty, port, &stop_char, 1); if (status <= 0) { dev_err(&port->dev, "%s - failed to write stop character, %d\n", __func__, status); } } /* * if we are implementing RTS/CTS, stop reads * and the Edgeport will clear the RTS line */ if (C_CRTSCTS(tty)) stop_read(edge_port); } static void edge_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct edgeport_port *edge_port = usb_get_serial_port_data(port); int status; if (edge_port == NULL) return; /* if we are implementing XON/XOFF, send the start character */ if (I_IXOFF(tty)) { unsigned char start_char = START_CHAR(tty); status = edge_write(tty, port, &start_char, 1); if (status <= 0) { dev_err(&port->dev, "%s - failed to write start character, %d\n", __func__, status); } } /* * if we are implementing RTS/CTS, restart reads * are the Edgeport will assert the RTS line */ if (C_CRTSCTS(tty)) { status = restart_read(edge_port); if (status) dev_err(&port->dev, "%s - read bulk usb_submit_urb failed: %d\n", __func__, status); } } static void stop_read(struct edgeport_port *edge_port) { unsigned long flags; spin_lock_irqsave(&edge_port->ep_lock, flags); if (edge_port->ep_read_urb_state == EDGE_READ_URB_RUNNING) edge_port->ep_read_urb_state = EDGE_READ_URB_STOPPING; edge_port->shadow_mcr &= ~MCR_RTS; spin_unlock_irqrestore(&edge_port->ep_lock, flags); } static int restart_read(struct edgeport_port *edge_port) { struct urb *urb; int status = 0; unsigned long flags; spin_lock_irqsave(&edge_port->ep_lock, flags); if (edge_port->ep_read_urb_state == EDGE_READ_URB_STOPPED) { urb = edge_port->port->read_urb; status = usb_submit_urb(urb, GFP_ATOMIC); } edge_port->ep_read_urb_state = EDGE_READ_URB_RUNNING; edge_port->shadow_mcr |= MCR_RTS; spin_unlock_irqrestore(&edge_port->ep_lock, flags); return status; } static void change_port_settings(struct tty_struct *tty, struct edgeport_port *edge_port, const struct ktermios *old_termios) { struct device *dev = &edge_port->port->dev; struct ump_uart_config *config; int baud; unsigned cflag; int status; config = kmalloc (sizeof (*config), GFP_KERNEL); if (!config) { tty->termios = *old_termios; return; } cflag = tty->termios.c_cflag; config->wFlags = 0; /* These flags must be set */ config->wFlags |= UMP_MASK_UART_FLAGS_RECEIVE_MS_INT; config->wFlags |= UMP_MASK_UART_FLAGS_AUTO_START_ON_ERR; config->bUartMode = (u8)(edge_port->bUartMode); switch (cflag & CSIZE) { case CS5: config->bDataBits = UMP_UART_CHAR5BITS; dev_dbg(dev, "%s - data bits = 5\n", __func__); break; case CS6: config->bDataBits = UMP_UART_CHAR6BITS; dev_dbg(dev, "%s - data bits = 6\n", __func__); break; case CS7: config->bDataBits = UMP_UART_CHAR7BITS; dev_dbg(dev, "%s - data bits = 7\n", __func__); break; default: case CS8: config->bDataBits = UMP_UART_CHAR8BITS; dev_dbg(dev, "%s - data bits = 8\n", __func__); break; } if (cflag & PARENB) { if (cflag & PARODD) { config->wFlags |= UMP_MASK_UART_FLAGS_PARITY; config->bParity = UMP_UART_ODDPARITY; dev_dbg(dev, "%s - parity = odd\n", __func__); } else { config->wFlags |= UMP_MASK_UART_FLAGS_PARITY; config->bParity = UMP_UART_EVENPARITY; dev_dbg(dev, "%s - parity = even\n", __func__); } } else { config->bParity = UMP_UART_NOPARITY; dev_dbg(dev, "%s - parity = none\n", __func__); } if (cflag & CSTOPB) { config->bStopBits = UMP_UART_STOPBIT2; dev_dbg(dev, "%s - stop bits = 2\n", __func__); } else { config->bStopBits = UMP_UART_STOPBIT1; dev_dbg(dev, "%s - stop bits = 1\n", __func__); } /* figure out the flow control settings */ if (cflag & CRTSCTS) { config->wFlags |= UMP_MASK_UART_FLAGS_OUT_X_CTS_FLOW; config->wFlags |= UMP_MASK_UART_FLAGS_RTS_FLOW; dev_dbg(dev, "%s - RTS/CTS is enabled\n", __func__); } else { dev_dbg(dev, "%s - RTS/CTS is disabled\n", __func__); restart_read(edge_port); } /* * if we are implementing XON/XOFF, set the start and stop * character in the device */ config->cXon = START_CHAR(tty); config->cXoff = STOP_CHAR(tty); /* if we are implementing INBOUND XON/XOFF */ if (I_IXOFF(tty)) { config->wFlags |= UMP_MASK_UART_FLAGS_IN_X; dev_dbg(dev, "%s - INBOUND XON/XOFF is enabled, XON = %2x, XOFF = %2x\n", __func__, config->cXon, config->cXoff); } else dev_dbg(dev, "%s - INBOUND XON/XOFF is disabled\n", __func__); /* if we are implementing OUTBOUND XON/XOFF */ if (I_IXON(tty)) { config->wFlags |= UMP_MASK_UART_FLAGS_OUT_X; dev_dbg(dev, "%s - OUTBOUND XON/XOFF is enabled, XON = %2x, XOFF = %2x\n", __func__, config->cXon, config->cXoff); } else dev_dbg(dev, "%s - OUTBOUND XON/XOFF is disabled\n", __func__); tty->termios.c_cflag &= ~CMSPAR; /* Round the baud rate */ baud = tty_get_baud_rate(tty); if (!baud) { /* pick a default, any default... */ baud = 9600; } else { /* Avoid a zero divisor. */ baud = min(baud, 461550); tty_encode_baud_rate(tty, baud, baud); } edge_port->baud_rate = baud; config->wBaudRate = (u16)((461550L + baud/2) / baud); /* FIXME: Recompute actual baud from divisor here */ dev_dbg(dev, "%s - baud rate = %d, wBaudRate = %d\n", __func__, baud, config->wBaudRate); dev_dbg(dev, "wBaudRate: %d\n", (int)(461550L / config->wBaudRate)); dev_dbg(dev, "wFlags: 0x%x\n", config->wFlags); dev_dbg(dev, "bDataBits: %d\n", config->bDataBits); dev_dbg(dev, "bParity: %d\n", config->bParity); dev_dbg(dev, "bStopBits: %d\n", config->bStopBits); dev_dbg(dev, "cXon: %d\n", config->cXon); dev_dbg(dev, "cXoff: %d\n", config->cXoff); dev_dbg(dev, "bUartMode: %d\n", config->bUartMode); /* move the word values into big endian mode */ cpu_to_be16s(&config->wFlags); cpu_to_be16s(&config->wBaudRate); status = send_port_cmd(edge_port->port, UMPC_SET_CONFIG, 0, config, sizeof(*config)); if (status) dev_dbg(dev, "%s - error %d when trying to write config to device\n", __func__, status); kfree(config); } static void edge_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct edgeport_port *edge_port = usb_get_serial_port_data(port); if (edge_port == NULL) return; /* change the port settings to the new ones specified */ change_port_settings(tty, edge_port, old_termios); } static int edge_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct edgeport_port *edge_port = usb_get_serial_port_data(port); unsigned int mcr; unsigned long flags; spin_lock_irqsave(&edge_port->ep_lock, flags); mcr = edge_port->shadow_mcr; if (set & TIOCM_RTS) mcr |= MCR_RTS; if (set & TIOCM_DTR) mcr |= MCR_DTR; if (set & TIOCM_LOOP) mcr |= MCR_LOOPBACK; if (clear & TIOCM_RTS) mcr &= ~MCR_RTS; if (clear & TIOCM_DTR) mcr &= ~MCR_DTR; if (clear & TIOCM_LOOP) mcr &= ~MCR_LOOPBACK; edge_port->shadow_mcr = mcr; spin_unlock_irqrestore(&edge_port->ep_lock, flags); restore_mcr(edge_port, mcr); return 0; } static int edge_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct edgeport_port *edge_port = usb_get_serial_port_data(port); unsigned int result = 0; unsigned int msr; unsigned int mcr; unsigned long flags; spin_lock_irqsave(&edge_port->ep_lock, flags); msr = edge_port->shadow_msr; mcr = edge_port->shadow_mcr; result = ((mcr & MCR_DTR) ? TIOCM_DTR: 0) /* 0x002 */ | ((mcr & MCR_RTS) ? TIOCM_RTS: 0) /* 0x004 */ | ((msr & EDGEPORT_MSR_CTS) ? TIOCM_CTS: 0) /* 0x020 */ | ((msr & EDGEPORT_MSR_CD) ? TIOCM_CAR: 0) /* 0x040 */ | ((msr & EDGEPORT_MSR_RI) ? TIOCM_RI: 0) /* 0x080 */ | ((msr & EDGEPORT_MSR_DSR) ? TIOCM_DSR: 0); /* 0x100 */ dev_dbg(&port->dev, "%s -- %x\n", __func__, result); spin_unlock_irqrestore(&edge_port->ep_lock, flags); return result; } static int edge_break(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct edgeport_port *edge_port = usb_get_serial_port_data(port); int status; int bv = 0; /* Off */ if (break_state == -1) bv = 1; /* On */ status = ti_do_config(edge_port, UMPC_SET_CLR_BREAK, bv); if (status) { dev_dbg(&port->dev, "%s - error %d sending break set/clear command.\n", __func__, status); return status; } return 0; } static void edge_heartbeat_schedule(struct edgeport_serial *edge_serial) { if (!edge_serial->use_heartbeat) return; schedule_delayed_work(&edge_serial->heartbeat_work, FW_HEARTBEAT_SECS * HZ); } static void edge_heartbeat_work(struct work_struct *work) { struct edgeport_serial *serial; struct ti_i2c_desc *rom_desc; serial = container_of(work, struct edgeport_serial, heartbeat_work.work); rom_desc = kmalloc(sizeof(*rom_desc), GFP_KERNEL); /* Descriptor address request is enough to reset the firmware timer */ if (!rom_desc || !get_descriptor_addr(serial, I2C_DESC_TYPE_ION, rom_desc)) { dev_err(&serial->serial->interface->dev, "%s - Incomplete heartbeat\n", __func__); } kfree(rom_desc); edge_heartbeat_schedule(serial); } static int edge_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { struct device *dev = &serial->interface->dev; unsigned char num_ports = serial->type->num_ports; /* Make sure we have the required endpoints when in download mode. */ if (serial->interface->cur_altsetting->desc.bNumEndpoints > 1) { if (epds->num_bulk_in < num_ports || epds->num_bulk_out < num_ports || epds->num_interrupt_in < 1) { dev_err(dev, "required endpoints missing\n"); return -ENODEV; } } return num_ports; } static int edge_startup(struct usb_serial *serial) { struct edgeport_serial *edge_serial; int status; u16 product_id; /* create our private serial structure */ edge_serial = kzalloc(sizeof(struct edgeport_serial), GFP_KERNEL); if (!edge_serial) return -ENOMEM; mutex_init(&edge_serial->es_lock); edge_serial->serial = serial; INIT_DELAYED_WORK(&edge_serial->heartbeat_work, edge_heartbeat_work); usb_set_serial_data(serial, edge_serial); status = download_fw(edge_serial); if (status < 0) { kfree(edge_serial); return status; } if (status > 0) return 1; /* bind but do not register any ports */ product_id = le16_to_cpu( edge_serial->serial->dev->descriptor.idProduct); /* Currently only the EP/416 models require heartbeat support */ if (edge_serial->fw_version > FW_HEARTBEAT_VERSION_CUTOFF) { if (product_id == ION_DEVICE_ID_TI_EDGEPORT_416 || product_id == ION_DEVICE_ID_TI_EDGEPORT_416B) { edge_serial->use_heartbeat = true; } } edge_heartbeat_schedule(edge_serial); return 0; } static void edge_disconnect(struct usb_serial *serial) { struct edgeport_serial *edge_serial = usb_get_serial_data(serial); cancel_delayed_work_sync(&edge_serial->heartbeat_work); } static void edge_release(struct usb_serial *serial) { struct edgeport_serial *edge_serial = usb_get_serial_data(serial); cancel_delayed_work_sync(&edge_serial->heartbeat_work); kfree(edge_serial); } static int edge_port_probe(struct usb_serial_port *port) { struct edgeport_port *edge_port; int ret; edge_port = kzalloc(sizeof(*edge_port), GFP_KERNEL); if (!edge_port) return -ENOMEM; spin_lock_init(&edge_port->ep_lock); edge_port->port = port; edge_port->edge_serial = usb_get_serial_data(port->serial); edge_port->bUartMode = default_uart_mode; switch (port->port_number) { case 0: edge_port->uart_base = UMPMEM_BASE_UART1; edge_port->dma_address = UMPD_OEDB1_ADDRESS; break; case 1: edge_port->uart_base = UMPMEM_BASE_UART2; edge_port->dma_address = UMPD_OEDB2_ADDRESS; break; default: dev_err(&port->dev, "unknown port number\n"); ret = -ENODEV; goto err; } dev_dbg(&port->dev, "%s - port_number = %d, uart_base = %04x, dma_address = %04x\n", __func__, port->port_number, edge_port->uart_base, edge_port->dma_address); usb_set_serial_port_data(port, edge_port); ret = edge_create_sysfs_attrs(port); if (ret) goto err; /* * The LSR does not tell when the transmitter shift register has * emptied so add a one-character drain delay. */ port->port.drain_delay = 1; return 0; err: kfree(edge_port); return ret; } static void edge_port_remove(struct usb_serial_port *port) { struct edgeport_port *edge_port; edge_port = usb_get_serial_port_data(port); edge_remove_sysfs_attrs(port); kfree(edge_port); } /* Sysfs Attributes */ static ssize_t uart_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_serial_port *port = to_usb_serial_port(dev); struct edgeport_port *edge_port = usb_get_serial_port_data(port); return sprintf(buf, "%d\n", edge_port->bUartMode); } static ssize_t uart_mode_store(struct device *dev, struct device_attribute *attr, const char *valbuf, size_t count) { struct usb_serial_port *port = to_usb_serial_port(dev); struct edgeport_port *edge_port = usb_get_serial_port_data(port); unsigned int v = simple_strtoul(valbuf, NULL, 0); dev_dbg(dev, "%s: setting uart_mode = %d\n", __func__, v); if (v < 256) edge_port->bUartMode = v; else dev_err(dev, "%s - uart_mode %d is invalid\n", __func__, v); return count; } static DEVICE_ATTR_RW(uart_mode); static int edge_create_sysfs_attrs(struct usb_serial_port *port) { return device_create_file(&port->dev, &dev_attr_uart_mode); } static int edge_remove_sysfs_attrs(struct usb_serial_port *port) { device_remove_file(&port->dev, &dev_attr_uart_mode); return 0; } #ifdef CONFIG_PM static int edge_suspend(struct usb_serial *serial, pm_message_t message) { struct edgeport_serial *edge_serial = usb_get_serial_data(serial); cancel_delayed_work_sync(&edge_serial->heartbeat_work); return 0; } static int edge_resume(struct usb_serial *serial) { struct edgeport_serial *edge_serial = usb_get_serial_data(serial); edge_heartbeat_schedule(edge_serial); return 0; } #endif static struct usb_serial_driver edgeport_1port_device = { .driver = { .name = "edgeport_ti_1", }, .description = "Edgeport TI 1 port adapter", .id_table = edgeport_1port_id_table, .num_ports = 1, .num_bulk_out = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, .calc_num_ports = edge_calc_num_ports, .disconnect = edge_disconnect, .release = edge_release, .port_probe = edge_port_probe, .port_remove = edge_port_remove, .set_termios = edge_set_termios, .tiocmget = edge_tiocmget, .tiocmset = edge_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .get_icount = usb_serial_generic_get_icount, .write = edge_write, .write_room = edge_write_room, .chars_in_buffer = edge_chars_in_buffer, .tx_empty = edge_tx_empty, .break_ctl = edge_break, .read_int_callback = edge_interrupt_callback, .read_bulk_callback = edge_bulk_in_callback, .write_bulk_callback = edge_bulk_out_callback, #ifdef CONFIG_PM .suspend = edge_suspend, .resume = edge_resume, #endif }; static struct usb_serial_driver edgeport_2port_device = { .driver = { .name = "edgeport_ti_2", }, .description = "Edgeport TI 2 port adapter", .id_table = edgeport_2port_id_table, .num_ports = 2, .num_bulk_out = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, .calc_num_ports = edge_calc_num_ports, .disconnect = edge_disconnect, .release = edge_release, .port_probe = edge_port_probe, .port_remove = edge_port_remove, .set_termios = edge_set_termios, .tiocmget = edge_tiocmget, .tiocmset = edge_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .get_icount = usb_serial_generic_get_icount, .write = edge_write, .write_room = edge_write_room, .chars_in_buffer = edge_chars_in_buffer, .tx_empty = edge_tx_empty, .break_ctl = edge_break, .read_int_callback = edge_interrupt_callback, .read_bulk_callback = edge_bulk_in_callback, .write_bulk_callback = edge_bulk_out_callback, #ifdef CONFIG_PM .suspend = edge_suspend, .resume = edge_resume, #endif }; static struct usb_serial_driver * const serial_drivers[] = { &edgeport_1port_device, &edgeport_2port_device, NULL }; module_usb_serial_driver(serial_drivers, id_table_combined); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); MODULE_FIRMWARE("edgeport/down3.bin"); module_param(ignore_cpu_rev, bool, 0644); MODULE_PARM_DESC(ignore_cpu_rev, "Ignore the cpu revision when connecting to a device"); module_param(default_uart_mode, int, 0644); MODULE_PARM_DESC(default_uart_mode, "Default uart_mode, 0=RS232, ...");
46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 // SPDX-License-Identifier: GPL-2.0-only /* Accounting handling for netfilter. */ /* * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> static bool nf_ct_acct __read_mostly; module_param_named(acct, nf_ct_acct, bool, 0644); MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); void nf_conntrack_acct_pernet_init(struct net *net) { net->ct.sysctl_acct = nf_ct_acct; }
11 13 12 3 5 12 9 8 13 5 3 2 4 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 // SPDX-License-Identifier: GPL-2.0-only /* * Cryptographic API * * Michael MIC (IEEE 802.11i/TKIP) keyed digest * * Copyright (c) 2004 Jouni Malinen <j@w1.fi> */ #include <crypto/internal/hash.h> #include <linux/unaligned.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/types.h> struct michael_mic_ctx { u32 l, r; }; struct michael_mic_desc_ctx { __le32 pending; size_t pending_len; u32 l, r; }; static inline u32 xswap(u32 val) { return ((val & 0x00ff00ff) << 8) | ((val & 0xff00ff00) >> 8); } #define michael_block(l, r) \ do { \ r ^= rol32(l, 17); \ l += r; \ r ^= xswap(l); \ l += r; \ r ^= rol32(l, 3); \ l += r; \ r ^= ror32(l, 2); \ l += r; \ } while (0) static int michael_init(struct shash_desc *desc) { struct michael_mic_desc_ctx *mctx = shash_desc_ctx(desc); struct michael_mic_ctx *ctx = crypto_shash_ctx(desc->tfm); mctx->pending_len = 0; mctx->l = ctx->l; mctx->r = ctx->r; return 0; } static int michael_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct michael_mic_desc_ctx *mctx = shash_desc_ctx(desc); if (mctx->pending_len) { int flen = 4 - mctx->pending_len; if (flen > len) flen = len; memcpy((u8 *)&mctx->pending + mctx->pending_len, data, flen); mctx->pending_len += flen; data += flen; len -= flen; if (mctx->pending_len < 4) return 0; mctx->l ^= le32_to_cpu(mctx->pending); michael_block(mctx->l, mctx->r); mctx->pending_len = 0; } while (len >= 4) { mctx->l ^= get_unaligned_le32(data); michael_block(mctx->l, mctx->r); data += 4; len -= 4; } if (len > 0) { mctx->pending_len = len; memcpy(&mctx->pending, data, len); } return 0; } static int michael_final(struct shash_desc *desc, u8 *out) { struct michael_mic_desc_ctx *mctx = shash_desc_ctx(desc); u8 *data = (u8 *)&mctx->pending; /* Last block and padding (0x5a, 4..7 x 0) */ switch (mctx->pending_len) { case 0: mctx->l ^= 0x5a; break; case 1: mctx->l ^= data[0] | 0x5a00; break; case 2: mctx->l ^= data[0] | (data[1] << 8) | 0x5a0000; break; case 3: mctx->l ^= data[0] | (data[1] << 8) | (data[2] << 16) | 0x5a000000; break; } michael_block(mctx->l, mctx->r); /* l ^= 0; */ michael_block(mctx->l, mctx->r); put_unaligned_le32(mctx->l, out); put_unaligned_le32(mctx->r, out + 4); return 0; } static int michael_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct michael_mic_ctx *mctx = crypto_shash_ctx(tfm); if (keylen != 8) return -EINVAL; mctx->l = get_unaligned_le32(key); mctx->r = get_unaligned_le32(key + 4); return 0; } static struct shash_alg alg = { .digestsize = 8, .setkey = michael_setkey, .init = michael_init, .update = michael_update, .final = michael_final, .descsize = sizeof(struct michael_mic_desc_ctx), .base = { .cra_name = "michael_mic", .cra_driver_name = "michael_mic-generic", .cra_blocksize = 8, .cra_ctxsize = sizeof(struct michael_mic_ctx), .cra_module = THIS_MODULE, } }; static int __init michael_mic_init(void) { return crypto_register_shash(&alg); } static void __exit michael_mic_exit(void) { crypto_unregister_shash(&alg); } module_init(michael_mic_init); module_exit(michael_mic_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Michael MIC"); MODULE_AUTHOR("Jouni Malinen <j@w1.fi>"); MODULE_ALIAS_CRYPTO("michael_mic");
136 137 497 500 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dax.c - Direct Access filesystem code * Copyright (c) 2013-2014 Intel Corporation * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> * Author: Ross Zwisler <ross.zwisler@linux.intel.com> */ #include <linux/atomic.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/highmem.h> #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/pagevec.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/uio.h> #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> #include <linux/rmap.h> #include <asm/pgalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) { int i; for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) init_waitqueue_head(wait_table + i); return 0; } fs_initcall(init_dax_wait_table); /* * DAX pagecache entries use XArray value entries so they can't be mistaken * for pages. We use one bit for locking, one bit for the entry size (PMD) * and two more to tell us if the entry is a zero page or an empty entry that * is just used for locking. In total four special bits. * * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. */ #define DAX_SHIFT (4) #define DAX_LOCKED (1UL << 0) #define DAX_PMD (1UL << 1) #define DAX_ZERO_PAGE (1UL << 2) #define DAX_EMPTY (1UL << 3) static unsigned long dax_to_pfn(void *entry) { return xa_to_value(entry) >> DAX_SHIFT; } static struct folio *dax_to_folio(void *entry) { return page_folio(pfn_to_page(dax_to_pfn(entry))); } static void *dax_make_entry(pfn_t pfn, unsigned long flags) { return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } static bool dax_is_locked(void *entry) { return xa_to_value(entry) & DAX_LOCKED; } static unsigned int dax_entry_order(void *entry) { if (xa_to_value(entry) & DAX_PMD) return PMD_ORDER; return 0; } static unsigned long dax_is_pmd_entry(void *entry) { return xa_to_value(entry) & DAX_PMD; } static bool dax_is_pte_entry(void *entry) { return !(xa_to_value(entry) & DAX_PMD); } static int dax_is_zero_entry(void *entry) { return xa_to_value(entry) & DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) { return xa_to_value(entry) & DAX_EMPTY; } /* * true if the entry that was found is of a smaller order than the entry * we were looking for */ static bool dax_is_conflict(void *entry) { return entry == XA_RETRY_ENTRY; } /* * DAX page cache entry locking */ struct exceptional_entry_key { struct xarray *xa; pgoff_t entry_start; }; struct wait_exceptional_entry_queue { wait_queue_entry_t wait; struct exceptional_entry_key key; }; /** * enum dax_wake_mode: waitqueue wakeup behaviour * @WAKE_ALL: wake all waiters in the waitqueue * @WAKE_NEXT: wake only the first waiter in the waitqueue */ enum dax_wake_mode { WAKE_ALL, WAKE_NEXT, }; static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry, struct exceptional_entry_key *key) { unsigned long hash; unsigned long index = xas->xa_index; /* * If 'entry' is a PMD, align the 'index' that we use for the wait * queue to the start of that PMD. This ensures that all offsets in * the range covered by the PMD map to the same bit lock. */ if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; key->xa = xas->xa; key->entry_start = index; hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; struct wait_exceptional_entry_queue *ewait = container_of(wait, struct wait_exceptional_entry_queue, wait); if (key->xa != ewait->key.xa || key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); } /* * @entry may no longer be the entry at the index in the mapping. * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ static void dax_wake_entry(struct xa_state *xas, void *entry, enum dax_wake_mode mode) { struct exceptional_entry_key key; wait_queue_head_t *wq; wq = dax_entry_waitqueue(xas, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens * under the i_pages lock, ditto for entry handling in our callers. * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); } /* * Look up entry in page cache, wait for it to become unlocked if it * is a DAX entry and return it. The caller must subsequently call * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() * if it did. The entry returned may have a larger order than @order. * If @order is larger than the order of the entry found in i_pages, this * function returns a dax_is_conflict entry. * * Must be called with the i_pages lock held. */ static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order) { void *entry; struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; for (;;) { entry = xas_find_conflict(xas); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) return entry; if (dax_entry_order(entry) < order) return XA_RETRY_ENTRY; if (!dax_is_locked(entry)) return entry; wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); xas_reset(xas); schedule(); finish_wait(wq, &ewait.wait); xas_lock_irq(xas); } } /* * Wait for the given entry to become unlocked. Caller must hold the i_pages * lock and call either put_unlocked_entry() if it did not lock the entry or * dax_unlock_entry() if it did. Returns an unlocked entry if still present. */ static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) { struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; while (unlikely(dax_is_locked(entry))) { wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_reset(xas); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); xas_lock_irq(xas); entry = xas_load(xas); } if (xa_is_internal(entry)) return NULL; return entry; } /* * The only thing keeping the address space around is the i_pages lock * (it's cycled in clear_inode() after removing the entries from i_pages) * After we call xas_unlock_irq(), we cannot touch xas->xa. */ static void wait_entry_unlocked(struct xa_state *xas, void *entry) { struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; wq = dax_entry_waitqueue(xas, entry, &ewait.key); /* * Unlike get_next_unlocked_entry() there is no guarantee that this * path ever successfully retrieves an unlocked entry before an * inode dies. Perform a non-exclusive wait in case this path * never successfully performs its own wake up. */ prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); } static void put_unlocked_entry(struct xa_state *xas, void *entry, enum dax_wake_mode mode) { if (entry && !dax_is_conflict(entry)) dax_wake_entry(xas, entry, mode); } /* * We used the xa_state to get the entry, but then we locked the entry and * dropped the xa_lock, so we know the xa_state is stale and must be reset * before use. */ static void dax_unlock_entry(struct xa_state *xas, void *entry) { void *old; BUG_ON(dax_is_locked(entry)); xas_reset(xas); xas_lock_irq(xas); old = xas_store(xas, entry); xas_unlock_irq(xas); BUG_ON(!dax_is_locked(old)); dax_wake_entry(xas, entry, WAKE_NEXT); } /* * Return: The entry stored at this location before it was locked. */ static void *dax_lock_entry(struct xa_state *xas, void *entry) { unsigned long v = xa_to_value(entry); return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); } static unsigned long dax_entry_size(void *entry) { if (dax_is_zero_entry(entry)) return 0; else if (dax_is_empty_entry(entry)) return 0; else if (dax_is_pmd_entry(entry)) return PMD_SIZE; else return PAGE_SIZE; } /* * A DAX folio is considered shared if it has no mapping set and ->share (which * shares the ->index field) is non-zero. Note this may return false even if the * page is shared between multiple files but has not yet actually been mapped * into multiple address spaces. */ static inline bool dax_folio_is_shared(struct folio *folio) { return !folio->mapping && folio->share; } /* * When it is called by dax_insert_entry(), the shared flag will indicate * whether this entry is shared by multiple files. If the page has not * previously been associated with any mappings the ->mapping and ->index * fields will be set. If it has already been associated with a mapping * the mapping will be cleared and the share count set. It's then up to * reverse map users like memory_failure() to call back into the filesystem to * recover ->mapping and ->index information. For example by implementing * dax_holder_operations. */ static void dax_folio_make_shared(struct folio *folio) { /* * folio is not currently shared so mark it as shared by clearing * folio->mapping. */ folio->mapping = NULL; /* * folio has previously been mapped into one address space so set the * share count. */ folio->share = 1; } static inline unsigned long dax_folio_put(struct folio *folio) { unsigned long ref; int order, i; if (!dax_folio_is_shared(folio)) ref = 0; else ref = --folio->share; if (ref) return ref; folio->mapping = NULL; order = folio_order(folio); if (!order) return 0; folio_reset_order(folio); for (i = 0; i < (1UL << order); i++) { struct dev_pagemap *pgmap = page_pgmap(&folio->page); struct page *page = folio_page(folio, i); struct folio *new_folio = (struct folio *)page; ClearPageHead(page); clear_compound_head(page); new_folio->mapping = NULL; /* * Reset pgmap which was over-written by * prep_compound_page(). */ new_folio->pgmap = pgmap; new_folio->share = 0; WARN_ON_ONCE(folio_ref_count(new_folio)); } return ref; } static void dax_folio_init(void *entry) { struct folio *folio = dax_to_folio(entry); int order = dax_entry_order(entry); /* * Folio should have been split back to order-0 pages in * dax_folio_put() when they were removed from their * final mapping. */ WARN_ON_ONCE(folio_order(folio)); if (order > 0) { prep_compound_page(&folio->page, order); if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); WARN_ON_ONCE(folio_ref_count(folio)); } } static void dax_associate_entry(void *entry, struct address_space *mapping, struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), index; struct folio *folio = dax_to_folio(entry); if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; index = linear_page_index(vma, address & ~(size - 1)); if (shared && (folio->mapping || dax_folio_is_shared(folio))) { if (folio->mapping) dax_folio_make_shared(folio); WARN_ON_ONCE(!folio->share); WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio)); folio->share++; } else { WARN_ON_ONCE(folio->mapping); dax_folio_init(entry); folio = dax_to_folio(entry); folio->mapping = mapping; folio->index = index; } } static void dax_disassociate_entry(void *entry, struct address_space *mapping, bool trunc) { struct folio *folio = dax_to_folio(entry); if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; dax_folio_put(folio); } static struct page *dax_busy_page(void *entry) { struct folio *folio = dax_to_folio(entry); if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return NULL; if (folio_ref_count(folio) - folio_mapcount(folio)) return &folio->page; else return NULL; } /** * dax_lock_folio - Lock the DAX entry corresponding to a folio * @folio: The folio whose entry we want to lock * * Context: Process context. * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could * not be locked. */ dax_entry_t dax_lock_folio(struct folio *folio) { XA_STATE(xas, NULL, 0); void *entry; /* Ensure folio->mapping isn't freed while we look at it */ rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(folio->mapping); entry = NULL; if (!mapping || !dax_mapping(mapping)) break; /* * In the device-dax case there's no need to lock, a * struct dev_pagemap pin is sufficient to keep the * inode alive, and we assume we have dev_pagemap pin * otherwise we would not have a valid pfn_to_page() * translation. */ entry = (void *)~0UL; if (S_ISCHR(mapping->host->i_mode)) break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); if (mapping != folio->mapping) { xas_unlock_irq(&xas); continue; } xas_set(&xas, folio->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); wait_entry_unlocked(&xas, entry); rcu_read_lock(); continue; } dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); break; } rcu_read_unlock(); return (dax_entry_t)entry; } void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { struct address_space *mapping = folio->mapping; XA_STATE(xas, &mapping->i_pages, folio->index); if (S_ISCHR(mapping->host->i_mode)) return; dax_unlock_entry(&xas, (void *)cookie); } /* * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping * @mapping: the file's mapping whose entry we want to lock * @index: the offset within this file * @page: output the dax page corresponding to this dax entry * * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry * could not be locked. */ dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, struct page **page) { XA_STATE(xas, NULL, 0); void *entry; rcu_read_lock(); for (;;) { entry = NULL; if (!dax_mapping(mapping)) break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); xas_set(&xas, index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); wait_entry_unlocked(&xas, entry); rcu_read_lock(); continue; } if (!entry || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Because we are looking for entry from file's mapping * and index, so the entry may not be inserted for now, * or even a zero/empty entry. We don't think this is * an error case. So, return a special value and do * not output @page. */ entry = (void *)~0UL; } else { *page = pfn_to_page(dax_to_pfn(entry)); dax_lock_entry(&xas, entry); } xas_unlock_irq(&xas); break; } rcu_read_unlock(); return (dax_entry_t)entry; } void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, dax_entry_t cookie) { XA_STATE(xas, &mapping->i_pages, index); if (cookie == ~0UL) return; dax_unlock_entry(&xas, (void *)cookie); } /* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. If the page cache doesn't contain an entry at * that index, add a locked empty entry. * * When requesting an entry with size DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return VM_FAULT_FALLBACK. * This will happen if there are any PTE entries within the PMD range * that we are requesting. * * We always favor PTE entries over PMD entries. There isn't a flow where we * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD * insertion will fail if it finds any PTE entries already in the tree, and a * PTE insertion will cause an existing PMD entry to be unmapped and * downgraded to PTE entries. This happens for both PMD zero pages as * well as PMD empty entries. * * The exception to this downgrade path is for PMD entries that have * real storage backing them. We will leave these real PMD entries in * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. * * On error, this function does not return an ERR_PTR. Instead it returns * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values * overlap with xarray value entries. */ static void *grab_mapping_entry(struct xa_state *xas, struct address_space *mapping, unsigned int order) { unsigned long index = xas->xa_index; bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ void *entry; retry: pmd_downgrade = false; xas_lock_irq(xas); entry = get_next_unlocked_entry(xas, order); if (entry) { if (dax_is_conflict(entry)) goto fallback; if (!xa_is_value(entry)) { xas_set_err(xas, -EIO); goto out_unlock; } if (order == 0) { if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; } } } if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop * the i_pages lock. */ dax_lock_entry(xas, entry); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped. */ if (dax_is_zero_entry(entry)) { xas_unlock_irq(xas); unmap_mapping_pages(mapping, xas->xa_index & ~PG_PMD_COLOUR, PG_PMD_NR, false); xas_reset(xas); xas_lock_irq(xas); } dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ dax_wake_entry(xas, entry, WAKE_ALL); mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); } if (entry) { dax_lock_entry(xas, entry); } else { unsigned long flags = DAX_EMPTY; if (order > 0) flags |= DAX_PMD; entry = dax_make_entry(pfn_to_pfn_t(0), flags); dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; mapping->nrpages += 1UL << order; } out_unlock: xas_unlock_irq(xas); if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) goto retry; if (xas->xa_node == XA_ERROR(-ENOMEM)) return xa_mk_internal(VM_FAULT_OOM); if (xas_error(xas)) return xa_mk_internal(VM_FAULT_SIGBUS); return entry; fallback: xas_unlock_irq(xas); return xa_mk_internal(VM_FAULT_FALLBACK); } /** * dax_layout_busy_page_range - find first pinned page in @mapping * @mapping: address space to scan for a page with ref count > 1 * @start: Starting offset. Page containing 'start' is included. * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, * pages from 'start' till the end of file are included. * * DAX requires ZONE_DEVICE mapped pages. These pages are never * 'onlined' to the page allocator so they are considered idle when * page->count == 1. A filesystem uses this interface to determine if * any page in the mapping is busy, i.e. for DMA, or other * get_user_pages() usages. * * It is expected that the filesystem is holding locks to block the * establishment of new mappings in this address_space. I.e. it expects * to be able to run unmap_mapping_range() and subsequently not race * mapping_mapped() becoming true. */ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end) { void *entry; unsigned int scanned = 0; struct page *page = NULL; pgoff_t start_idx = start >> PAGE_SHIFT; pgoff_t end_idx; XA_STATE(xas, &mapping->i_pages, start_idx); /* * In the 'limited' case get_user_pages() for dax is disabled. */ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return NULL; if (!dax_mapping(mapping)) return NULL; /* If end == LLONG_MAX, all pages from start to till end of file */ if (end == LLONG_MAX) end_idx = ULONG_MAX; else end_idx = end >> PAGE_SHIFT; /* * If we race get_user_pages_fast() here either we'll see the * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by * pte_lock() and pmd_lock(). New references are not taken without * holding those locks, and unmap_mapping_pages() will not zero the * pte or pmd without holding the respective lock, so we are * guaranteed to either see new references or prevent new * references from being established. */ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); xas_lock_irq(&xas); xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue; entry = wait_entry_unlocked_exclusive(&xas, entry); if (entry) page = dax_busy_page(entry); put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); return page; } EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); struct page *dax_layout_busy_page(struct address_space *mapping) { return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; xas_lock_irq(&xas); entry = get_next_unlocked_entry(&xas, 0); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: put_unlocked_entry(&xas, entry, WAKE_ALL); xas_unlock_irq(&xas); return ret; } static int __dax_clear_dirty_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int scanned = 0; void *entry; xas_lock_irq(&xas); xas_for_each(&xas, entry, end) { entry = wait_entry_unlocked_exclusive(&xas, entry); if (!entry) continue; xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); put_unlocked_entry(&xas, entry, WAKE_NEXT); if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); return 0; } /* * Delete DAX entry at @index from @mapping. Wait for it * to be unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { int ret = __dax_invalidate_entry(mapping, index, true); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the * page cache (usually fs-private i_mmap_sem for writing). Since the * caller has seen a DAX entry for this index, we better find it * at that index as well... */ WARN_ON_ONCE(!ret); return ret; } void dax_delete_mapping_range(struct address_space *mapping, loff_t start, loff_t end) { void *entry; pgoff_t start_idx = start >> PAGE_SHIFT; pgoff_t end_idx; XA_STATE(xas, &mapping->i_pages, start_idx); /* If end == LLONG_MAX, all pages from start to till end of file */ if (end == LLONG_MAX) end_idx = ULONG_MAX; else end_idx = end >> PAGE_SHIFT; xas_lock_irq(&xas); xas_for_each(&xas, entry, end_idx) { if (!xa_is_value(entry)) continue; entry = wait_entry_unlocked_exclusive(&xas, entry); if (!entry) continue; dax_disassociate_entry(entry, mapping, true); xas_store(&xas, NULL); mapping->nrpages -= 1UL << dax_entry_order(entry); put_unlocked_entry(&xas, entry, WAKE_ALL); } xas_unlock_irq(&xas); } EXPORT_SYMBOL_GPL(dax_delete_mapping_range); static int wait_page_idle(struct page *page, void (cb)(struct inode *), struct inode *inode) { return ___wait_var_event(page, dax_page_is_idle(page), TASK_INTERRUPTIBLE, 0, 0, cb(inode)); } static void wait_page_idle_uninterruptible(struct page *page, struct inode *inode) { ___wait_var_event(page, dax_page_is_idle(page), TASK_UNINTERRUPTIBLE, 0, 0, schedule()); } /* * Unmaps the inode and waits for any DMA to complete prior to deleting the * DAX mapping entries for the range. * * For NOWAIT behavior, pass @cb as NULL to early-exit on first found * busy page */ int dax_break_layout(struct inode *inode, loff_t start, loff_t end, void (cb)(struct inode *)) { struct page *page; int error = 0; if (!dax_mapping(inode->i_mapping)) return 0; do { page = dax_layout_busy_page_range(inode->i_mapping, start, end); if (!page) break; if (!cb) { error = -ERESTARTSYS; break; } error = wait_page_idle(page, cb, inode); } while (error == 0); if (!page) dax_delete_mapping_range(inode->i_mapping, start, end); return error; } EXPORT_SYMBOL_GPL(dax_break_layout); void dax_break_layout_final(struct inode *inode) { struct page *page; if (!dax_mapping(inode->i_mapping)) return; do { page = dax_layout_busy_page_range(inode->i_mapping, 0, LLONG_MAX); if (!page) break; wait_page_idle_uninterruptible(page, inode); } while (true); if (!page) dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); } EXPORT_SYMBOL_GPL(dax_break_layout_final); /* * Invalidate DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) { return __dax_invalidate_entry(mapping, index, false); } static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) { return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); } static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) { pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos); void *vto, *kaddr; long rc; int id; id = dax_read_lock(); rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; } vto = kmap_atomic(vmf->cow_page); copy_user_page(vto, kaddr, vmf->address, vmf->cow_page); kunmap_atomic(vto); dax_read_unlock(id); return 0; } /* * MAP_SYNC on a dax mapping guarantees dirty metadata is * flushed on write-faults (non-cow), but not read-faults. */ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, struct vm_area_struct *vma) { return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && (iter->iomap.flags & IOMAP_F_DIRTY); } /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to * PTEs. If we happen to be trying to insert a PTE and there is a PMD * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. */ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void *entry, pfn_t pfn, unsigned long flags) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); bool write = iter->flags & IOMAP_WRITE; bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, PG_PMD_NR, false); else /* pte entry */ unmap_mapping_pages(mapping, index, 1, false); } xas_reset(xas); xas_lock_irq(xas); if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. */ old = dax_lock_entry(xas, new_entry); WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | DAX_LOCKED)); entry = new_entry; } else { xas_load(xas); /* Walk the xa_state */ } if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); return entry; } static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { unsigned long pfn, index, count, end; long ret = 0; struct vm_area_struct *vma; /* * A page got tagged dirty in DAX mapping? Something is seriously * wrong. */ if (WARN_ON(!xa_is_value(entry))) return -EIO; if (unlikely(dax_is_locked(entry))) { void *old_entry = entry; entry = get_next_unlocked_entry(xas, 0); /* Entry got punched out / reallocated? */ if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. * We have to compare pfns as we must not bail out due to * difference in lockbit or entry type. */ if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { ret = -EIO; goto put_unlocked; } /* Another fsync thread may have already done this entry */ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; } /* Lock the entry to serialize with page faults */ dax_lock_entry(xas, entry); /* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. This is achieved as the calls will look * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock. */ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); /* * If dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we use needs to be * aligned to the start of the PMD. * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ pfn = dax_to_pfn(entry); count = 1UL << dax_entry_order(entry); index = xas->xa_index & ~(count - 1); end = index + count - 1; /* Walk all mappings of a given index of a file and writeprotect them */ i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { pfn_mkclean_range(pfn, count, index, vma); cond_resched(); } i_mmap_unlock_read(mapping); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ xas_reset(xas); xas_lock_irq(xas); xas_store(xas, entry); xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); dax_wake_entry(xas, entry, WAKE_NEXT); trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: put_unlocked_entry(xas, entry, WAKE_NEXT); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc) { XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; void *entry; int ret = 0; unsigned int scanned = 0; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0; trace_dax_writeback_range(inode, xas.xa_index, end_index); tag_pages_for_writeback(mapping, xas.xa_index, end_index); xas_lock_irq(&xas); xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { ret = dax_writeback_one(&xas, dax_dev, mapping, entry); if (ret < 0) { mapping_set_error(mapping, ret); break; } if (++scanned % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); trace_dax_writeback_range_done(inode, xas.xa_index, end_index); return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, size_t size, void **kaddr, pfn_t *pfnp) { pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id, rc = 0; long length; id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, kaddr, pfnp); if (length < 0) { rc = length; goto out; } if (!pfnp) goto out_check_addr; rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) goto out; rc = 0; out_check_addr: if (!kaddr) goto out; if (!*kaddr) rc = -EFAULT; out: dax_read_unlock(id); return rc; } /** * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) * @srcmap: iomap srcmap * @daddr: destination address to copy to. * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of * aligned ranges is taken care by dax_iomap_iter() itself. * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the * area to make sure no old data remains. */ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ bool zero_edge = srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN; void *saddr = NULL; int ret = 0; if (!zero_edge) { ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); if (ret) return dax_mem2blk_err(ret); } if (copy_all) { if (zero_edge) memset(daddr, 0, size); else ret = copy_mc_to_kernel(daddr, saddr, length); goto out; } /* Copy the head part of the range */ if (head_off) { if (zero_edge) memset(daddr, 0, head_off); else { ret = copy_mc_to_kernel(daddr, saddr, head_off); if (ret) return -EIO; } } /* Copy the tail part of the range */ if (end < pg_end) { loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; if (zero_edge) memset(daddr + tail_off, 0, tail_len); else { ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, tail_len); if (ret) return -EIO; } } out: if (zero_edge) dax_flush(srcmap->dax_dev, daddr, size); return ret ? -EIO : 0; } /* * The user has performed a load from a hole in the file. Allocating a new * page in the file would cause excessive storage usage for workloads with * sparse files. Instead we insert a read-only mapping of the 4k zero page. * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. */ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false); trace_dax_load_hole(inode, vmf, ret); return ret; } #ifdef CONFIG_FS_DAX_PMD static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; struct inode *inode = mapping->host; pgtable_t pgtable = NULL; struct folio *zero_folio; spinlock_t *ptl; pmd_t pmd_entry; pfn_t pfn; zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); if (unlikely(!zero_folio)) goto fallback; pfn = page_to_pfn_t(&zero_folio->page); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE); if (arch_needs_pgtable_deposit()) { pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); goto fallback; } if (pgtable) { pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); mm_inc_nr_ptes(vma->vm_mm); } pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); return VM_FAULT_NOPAGE; fallback: if (pgtable) pte_free(vma->vm_mm, pgtable); trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); return VM_FAULT_FALLBACK; } #else static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { return VM_FAULT_FALLBACK; } #endif /* CONFIG_FS_DAX_PMD */ static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t copy_pos = iter->pos; u64 copy_len = iomap_length(iter); u32 mod; int id = 0; s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because * we need to copy entire blocks, not just the byte range specified. * Invalidate the mapping because we're about to CoW. */ mod = offset_in_page(copy_pos); if (mod) { copy_len += mod; copy_pos -= mod; } mod = offset_in_page(copy_pos + copy_len); if (mod) copy_len += PAGE_SIZE - mod; invalidate_inode_pages2_range(iter->inode->i_mapping, copy_pos >> PAGE_SHIFT, (copy_pos + copy_len - 1) >> PAGE_SHIFT); id = dax_read_lock(); ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL); if (ret < 0) goto out_unlock; ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL); if (ret < 0) goto out_unlock; if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); if (ret < 0) return dax_mem2blk_err(ret); return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, }; loff_t size = i_size_read(inode); int ret; if (pos < 0 || pos >= size) return 0; iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); unsigned offset = offset_in_page(pos); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); void *kaddr; long ret; ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret < 0) return dax_mem2blk_err(ret); memset(kaddr + offset, 0, size); if (iomap->flags & IOMAP_F_SHARED) ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, kaddr); else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); u64 length = iomap_length(iter); int ret; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return iomap_iter_advance(iter, &length); /* * invalidate the pages whose sharing state is to be changed * because of CoW. */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, iter->pos >> PAGE_SHIFT, (iter->pos + length - 1) >> PAGE_SHIFT); do { loff_t pos = iter->pos; unsigned offset = offset_in_page(pos); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id; length = min_t(u64, PAGE_SIZE - offset, length); id = dax_read_lock(); if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else ret = dax_memzero(iter, pos, length); dax_read_unlock(id); if (ret < 0) return ret; ret = iomap_iter_advance(iter, &length); if (ret) return ret; } while (length > 0); if (did_zero) *did_zero = true; return ret; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_DAX | IOMAP_ZERO, }; int ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); /* Block boundary? Nothing to do */ if (!off) return 0; return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); } EXPORT_SYMBOL_GPL(dax_truncate_page); static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; if (!write) { end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { done = iov_iter_zero(min(length, end - pos), iter); return iomap_iter_advance(iomi, &done); } } /* * In DAX mode, enforce either pure overwrites of written extents, or * writes to unwritten extents as part of a copy-on-write operation. */ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED))) return -EIO; /* * Write can allocate block for an area which has a hole page mapped * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ if (iomap->flags & IOMAP_F_NEW || cow) { /* * Filesystem allows CoW on non-shared extents. The src extents * may have been mmapped with dirty mark before. To be able to * invalidate its dax entries, we need to clear the dirty mark * in advance. */ if (cow) __dax_clear_dirty_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); } id = dax_read_lock(); while ((pos = iomi->pos) < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { ret = -EINTR; break; } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, &kaddr, NULL); if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) { map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_RECOVERY_WRITE, &kaddr, NULL); if (map_len > 0) recovery = true; } if (map_len < 0) { ret = dax_mem2blk_err(map_len); break; } if (cow) { ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, srcmap, kaddr); if (ret) break; } map_len = PFN_PHYS(map_len); kaddr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (recovery) xfer = dax_recovery_write(dax_dev, pgoff, kaddr, map_len, iter); else if (write) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); length = xfer; ret = iomap_iter_advance(iomi, &length); if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; } dax_read_unlock(id); return ret; } /** * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system * * This function performs read and write operations to directly mapped * persistent memory. The callers needs to take care of read/write exclusion * and evicting any page cache pages in the region under I/O. */ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops) { struct iomap_iter iomi = { .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(iter), .flags = IOMAP_DAX, }; loff_t done = 0; int ret; if (!iomi.len) return 0; if (iov_iter_rw(iter) == WRITE) { lockdep_assert_held_write(&iomi.inode->i_rwsem); iomi.flags |= IOMAP_WRITE; } else { lockdep_assert_held(&iomi.inode->i_rwsem); } if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; return done ? done : ret; } EXPORT_SYMBOL_GPL(dax_iomap_rw); static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; return vmf_error(error); } /* * When handling a synchronous page fault and the inode need a fsync, we can * insert the PTE/PMD into page tables only after that fsync happened. Skip * insertion for now and return the pfn so that caller can insert it after the * fsync is done. */ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) { if (WARN_ON_ONCE(!pfnp)) return VM_FAULT_SIGBUS; *pfnp = pfn; return VM_FAULT_NEEDDSYNC; } static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, const struct iomap_iter *iter) { vm_fault_t ret; int error = 0; switch (iter->iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: clear_user_highpage(vmf->cow_page, vmf->address); break; case IOMAP_MAPPED: error = copy_cow_page_dax(vmf, iter); break; default: WARN_ON_ONCE(1); error = -EIO; break; } if (error) return dax_fault_return(error); __SetPageUptodate(vmf->cow_page); ret = finish_fault(vmf); if (!ret) return VM_FAULT_DONE_COW; return ret; } /** * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault. * @vmf: vm fault instance * @iter: iomap iter * @pfnp: pfn to be returned * @xas: the dax mapping tree of a file * @entry: an unlocked dax entry to be inserted * @pmd: distinguish whether it is a pmd fault */ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, const struct iomap_iter *iter, pfn_t *pfnp, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; unsigned long entry_flags = pmd ? DAX_PMD : 0; struct folio *folio; int ret, err = 0; pfn_t pfn; void *kaddr; if (!pmd && vmf->cow_page) return dax_fault_cow_page(vmf, iter); /* if we are reading UNWRITTEN and HOLE, return a hole. */ if (!write && (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { if (!pmd) return dax_load_hole(xas, vmf, iter, entry); return dax_pmd_load_hole(xas, vmf, iter, entry); } if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) { WARN_ON_ONCE(1); return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; } err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn); if (err) return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); if (write && iomap->flags & IOMAP_F_SHARED) { err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } folio = dax_to_folio(*entry); if (dax_fault_is_synchronous(iter, vmf->vma)) return dax_fault_synchronous_pfnp(pfnp, pfn); folio_ref_inc(folio); if (pmd) ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)), write); else ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write); folio_put(folio); return ret; } static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct iomap_iter iter = { .inode = mapping->host, .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, .len = PAGE_SIZE, .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = 0; void *entry; int error; trace_dax_pte_fault(iter.inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ if (iter.pos >= i_size_read(iter.inode)) { ret = VM_FAULT_SIGBUS; goto out; } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) iter.flags |= IOMAP_WRITE; entry = grab_mapping_entry(&xas, mapping, 0); if (xa_is_internal(entry)) { ret = xa_to_internal(entry); goto out; } /* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PMD fault that overlaps with * the PTE we need to set up. If so just return and the fault will be * retried. */ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto unlock_entry; } while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { iter.status = -EIO; /* fs corruption? */ continue; } ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); if (ret != VM_FAULT_SIGBUS && (iter.iomap.flags & IOMAP_F_NEW)) { count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret |= VM_FAULT_MAJOR; } if (!(ret & VM_FAULT_ERROR)) { u64 length = PAGE_SIZE; iter.status = iomap_iter_advance(&iter, &length); } } if (iomap_errp) *iomap_errp = error; if (!ret && error) ret = dax_fault_return(error); unlock_entry: dax_unlock_entry(&xas, entry); out: trace_dax_pte_fault_done(iter.inode, vmf, ret); return ret; } #ifdef CONFIG_FS_DAX_PMD static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, pgoff_t max_pgoff) { unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; /* * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD * range in the page cache. */ if ((vmf->pgoff & PG_PMD_COLOUR) != ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) return true; /* Fall back to PTEs if we're going to COW */ if (write && !(vmf->vma->vm_flags & VM_SHARED)) return true; /* If the PMD would extend outside the VMA */ if (pmd_addr < vmf->vma->vm_start) return true; if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) return true; /* If the PMD would extend beyond the file size */ if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) return true; return false; } static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); struct iomap_iter iter = { .inode = mapping->host, .len = PMD_SIZE, .flags = IOMAP_DAX | IOMAP_FAULT, }; vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; void *entry; if (vmf->flags & FAULT_FLAG_WRITE) iter.flags |= IOMAP_WRITE; /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE); trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0); if (xas.xa_index >= max_pgoff) { ret = VM_FAULT_SIGBUS; goto out; } if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) goto fallback; /* * grab_mapping_entry() will make sure we get an empty PMD entry, * a zero PMD entry or a DAX PMD. If it can't (because a PTE * entry is already in the array, for instance), it will return * VM_FAULT_FALLBACK. */ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) { ret = xa_to_internal(entry); goto fallback; } /* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PTE fault that overlaps with * the PMD we need to set up. If so just return and the fault will be * retried. */ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && !pmd_devmap(*vmf->pmd)) { ret = 0; goto unlock_entry; } iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; while (iomap_iter(&iter, ops) > 0) { if (iomap_length(&iter) < PMD_SIZE) continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); if (ret != VM_FAULT_FALLBACK) { u64 length = PMD_SIZE; iter.status = iomap_iter_advance(&iter, &length); } } unlock_entry: dax_unlock_entry(&xas, entry); fallback: if (ret == VM_FAULT_FALLBACK) { split_huge_pmd(vmf->vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret); return ret; } #else static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } #endif /* CONFIG_FS_DAX_PMD */ /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller * has done all the necessary locking for page fault to proceed * successfully. */ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); else return VM_FAULT_FALLBACK; } EXPORT_SYMBOL_GPL(dax_iomap_fault); /* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault * @pfn: PFN to insert * @order: Order of entry to insert. * * This function inserts a writeable PTE or PMD entry into the page tables * for an mmaped DAX file. It also marks the page cache entry as dirty. */ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); struct folio *folio; void *entry; vm_fault_t ret; xas_lock_irq(&xas); entry = get_next_unlocked_entry(&xas, order); /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); folio = pfn_folio(pfn_t_to_pfn(pfn)); folio_ref_inc(folio); if (order == 0) ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); #ifdef CONFIG_FS_DAX_PMD else if (order == PMD_ORDER) ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE); #endif else ret = VM_FAULT_FALLBACK; folio_put(folio); dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; } /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. */ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); static int dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; u64 dest_len; void *saddr, *daddr; int id, ret; len = min(len, min(smap->length, dmap->length)); if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; goto advance; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { *same = false; return 0; } id = dax_read_lock(); ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), &saddr, NULL); if (ret < 0) goto out_unlock; ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), &daddr, NULL); if (ret < 0) goto out_unlock; *same = !memcmp(saddr, daddr, len); if (!*same) len = 0; dax_read_unlock(id); advance: dest_len = len; ret = iomap_iter_advance(it_src, &len); if (!ret) ret = iomap_iter_advance(it_dest, &dest_len); return ret; out_unlock: dax_read_unlock(id); return -EIO; } int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dst, loff_t dstoff, loff_t len, bool *same, const struct iomap_ops *ops) { struct iomap_iter src_iter = { .inode = src, .pos = srcoff, .len = len, .flags = IOMAP_DAX, }; struct iomap_iter dst_iter = { .inode = dst, .pos = dstoff, .len = len, .flags = IOMAP_DAX, }; int ret, status; while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { status = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); if (status < 0) return ret; src_iter.status = dst_iter.status = status; } return ret; } int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops) { return __generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags, ops); } EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
14 311 4 1 7 257 10 17 4 5 113 94 343 17 116 477 13 31 293 304 140 242 11 141 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ // clang-format off #ifndef _LINUX_NTFS3_NTFS_FS_H #define _LINUX_NTFS3_NTFS_FS_H #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/page-flags.h> #include <linux/pagemap.h> #include <linux/rbtree.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/time64.h> #include <linux/types.h> #include <linux/uidgid.h> #include <asm/div64.h> #include <asm/page.h> #include "debug.h" #include "ntfs.h" struct dentry; struct fiemap_extent_info; struct user_namespace; struct page; struct writeback_control; enum utf16_endian; #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 #define MAXIMUM_SHIFT_BYTES_PER_MFT 12 #define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) #define MAXIMUM_BYTES_PER_INDEX 4096 #define MAXIMUM_SHIFT_BYTES_PER_INDEX 12 #define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) /* NTFS specific error code when fixup failed. */ #define E_NTFS_FIXUP 555 /* NTFS specific error code about resident->nonresident. */ #define E_NTFS_NONRESIDENT 556 /* NTFS specific error code about punch hole. */ #define E_NTFS_NOTALIGNED 557 /* NTFS specific error code when on-disk struct is corrupted. */ #define E_NTFS_CORRUPT 558 /* sbi->flags */ #define NTFS_FLAGS_NODISCARD 0x00000001 /* ntfs in shutdown state. */ #define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/ /* Set when LogFile is replaying. */ #define NTFS_FLAGS_LOG_REPLAYING 0x00000008 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */ #define NTFS_FLAGS_MFTMIRR 0x00001000 #define NTFS_FLAGS_NEED_REPLAY 0x04000000 /* ni->ni_flags */ /* * Data attribute is external compressed (LZX/Xpress) * 1 - WOF_COMPRESSION_XPRESS4K * 2 - WOF_COMPRESSION_XPRESS8K * 3 - WOF_COMPRESSION_XPRESS16K * 4 - WOF_COMPRESSION_LZX32K */ #define NI_FLAG_COMPRESSED_MASK 0x0000000f /* Data attribute is deduplicated. */ #define NI_FLAG_DEDUPLICATED 0x00000010 #define NI_FLAG_EA 0x00000020 #define NI_FLAG_DIR 0x00000040 #define NI_FLAG_RESIDENT 0x00000080 #define NI_FLAG_UPDATE_PARENT 0x00000100 // clang-format on struct ntfs_mount_options { char *nls_name; struct nls_table *nls; kuid_t fs_uid; kgid_t fs_gid; u16 fs_fmask_inv; u16 fs_dmask_inv; unsigned fmask : 1; /* fmask was set. */ unsigned dmask : 1; /*dmask was set. */ unsigned sys_immutable : 1; /* Immutable system files. */ unsigned discard : 1; /* Issue discard requests on deletions. */ unsigned sparse : 1; /* Create sparse files. */ unsigned showmeta : 1; /* Show meta files. */ unsigned nohidden : 1; /* Do not show hidden files. */ unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */ unsigned windows_names : 1; /* Disallow names forbidden by Windows. */ unsigned force : 1; /* RW mount dirty volume. */ unsigned prealloc : 1; /* Preallocate space when file is growing. */ unsigned nocase : 1; /* case insensitive. */ }; /* Special value to unpack and deallocate. */ #define RUN_DEALLOCATE ((struct runs_tree *)(size_t)1) /* TODO: Use rb tree instead of array. */ struct runs_tree { struct ntfs_run *runs; size_t count; /* Currently used size a ntfs_run storage. */ size_t allocated; /* Currently allocated ntfs_run storage size. */ }; struct ntfs_buffers { /* Biggest MFT / smallest cluster = 4096 / 512 = 8 */ /* Biggest index / smallest cluster = 4096 / 512 = 8 */ struct buffer_head *bh[PAGE_SIZE >> SECTOR_SHIFT]; u32 bytes; u32 nbufs; u32 off; }; enum ALLOCATE_OPT { ALLOCATE_DEF = 0, // Allocate all clusters. ALLOCATE_MFT = 1, // Allocate for MFT. ALLOCATE_ZERO = 2, // Zeroout new allocated clusters }; enum bitmap_mutex_classes { BITMAP_MUTEX_CLUSTERS = 0, BITMAP_MUTEX_MFT = 1, }; struct wnd_bitmap { struct super_block *sb; struct rw_semaphore rw_lock; struct runs_tree run; size_t nbits; size_t total_zeroes; // Total number of free bits. u16 *free_bits; // Free bits in each window. size_t nwnd; u32 bits_last; // Bits in last window. struct rb_root start_tree; // Extents, sorted by 'start'. struct rb_root count_tree; // Extents, sorted by 'count + start'. size_t count; // Extents count. /* * -1 Tree is activated but not updated (too many fragments). * 0 - Tree is not activated. * 1 - Tree is activated and updated. */ int uptodated; size_t extent_min; // Minimal extent used while building. size_t extent_max; // Upper estimate of biggest free block. /* Zone [bit, end) */ size_t zone_bit; size_t zone_end; bool inited; }; typedef int (*NTFS_CMP_FUNC)(const void *key1, size_t len1, const void *key2, size_t len2, const void *param); enum index_mutex_classed { INDEX_MUTEX_I30 = 0, INDEX_MUTEX_SII = 1, INDEX_MUTEX_SDH = 2, INDEX_MUTEX_SO = 3, INDEX_MUTEX_SQ = 4, INDEX_MUTEX_SR = 5, INDEX_MUTEX_TOTAL }; /* ntfs_index - Allocation unit inside directory. */ struct ntfs_index { struct runs_tree bitmap_run; struct runs_tree alloc_run; /* read/write access to 'bitmap_run'/'alloc_run' while ntfs_readdir */ struct rw_semaphore run_lock; /*TODO: Remove 'cmp'. */ NTFS_CMP_FUNC cmp; u8 index_bits; // log2(root->index_block_size) u8 idx2vbn_bits; // log2(root->index_block_clst) u8 vbn2vbo_bits; // index_block_size < cluster? 9 : cluster_bits u8 type; // index_mutex_classed }; /* Minimum MFT zone. */ #define NTFS_MIN_MFT_ZONE 100 /* Step to increase the MFT. */ #define NTFS_MFT_INCREASE_STEP 1024 /* Ntfs file system in-core superblock data. */ struct ntfs_sb_info { struct super_block *sb; u32 discard_granularity; u64 discard_granularity_mask_inv; // ~(discard_granularity_mask_inv-1) u32 cluster_size; // bytes per cluster u32 cluster_mask; // == cluster_size - 1 u64 cluster_mask_inv; // ~(cluster_size - 1) u32 block_mask; // sb->s_blocksize - 1 u32 blocks_per_cluster; // cluster_size / sb->s_blocksize u32 record_size; u32 index_size; u8 cluster_bits; u8 record_bits; u64 maxbytes; // Maximum size for normal files. u64 maxbytes_sparse; // Maximum size for sparse file. unsigned long flags; // See NTFS_FLAGS_ CLST zone_max; // Maximum MFT zone length in clusters CLST bad_clusters; // The count of marked bad clusters. u16 max_bytes_per_attr; // Maximum attribute size in record. u16 attr_size_tr; // Attribute size threshold (320 bytes). /* Records in $Extend. */ CLST objid_no; CLST quota_no; CLST reparse_no; CLST usn_jrnl_no; struct ATTR_DEF_ENTRY *def_table; // Attribute definition table. u32 def_entries; u32 ea_max_size; struct MFT_REC *new_rec; u16 *upcase; struct { u64 lbo, lbo2; struct ntfs_inode *ni; struct wnd_bitmap bitmap; // $MFT::Bitmap /* * MFT records [11-24) used to expand MFT itself. * They always marked as used in $MFT::Bitmap * 'reserved_bitmap' contains real bitmap of these records. */ ulong reserved_bitmap; // Bitmap of used records [11 - 24) size_t next_free; // The next record to allocate from size_t used; // MFT valid size in records. u32 recs_mirr; // Number of records in MFTMirr u8 next_reserved; u8 reserved_bitmap_inited; } mft; struct { struct wnd_bitmap bitmap; // $Bitmap::Data CLST next_free_lcn; } used; struct { u64 size; // In bytes. u64 blocks; // In blocks. u64 ser_num; struct ntfs_inode *ni; __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY. u8 major_ver; u8 minor_ver; char label[256]; bool real_dirty; // Real fs state. } volume; struct { struct ntfs_index index_sii; struct ntfs_index index_sdh; struct ntfs_inode *ni; u32 next_id; u64 next_off; __le32 def_security_id; } security; struct { struct ntfs_index index_r; struct ntfs_inode *ni; u64 max_size; // 16K } reparse; struct { struct ntfs_index index_o; struct ntfs_inode *ni; } objid; struct { struct mutex mtx_lznt; struct lznt *lznt; #ifdef CONFIG_NTFS3_LZX_XPRESS struct mutex mtx_xpress; struct xpress_decompressor *xpress; struct mutex mtx_lzx; struct lzx_decompressor *lzx; #endif } compress; struct ntfs_mount_options *options; struct ratelimit_state msg_ratelimit; struct proc_dir_entry *procdir; }; /* One MFT record(usually 1024 bytes), consists of attributes. */ struct mft_inode { struct rb_node node; struct ntfs_sb_info *sbi; struct MFT_REC *mrec; struct ntfs_buffers nb; CLST rno; bool dirty; }; /* Nested class for ntfs_inode::ni_lock. */ enum ntfs_inode_mutex_lock_class { NTFS_INODE_MUTEX_DIRTY = 1, NTFS_INODE_MUTEX_SECURITY, NTFS_INODE_MUTEX_OBJID, NTFS_INODE_MUTEX_REPARSE, NTFS_INODE_MUTEX_NORMAL, NTFS_INODE_MUTEX_PARENT, NTFS_INODE_MUTEX_PARENT2, }; /* * struct ntfs_inode * * Ntfs inode - extends linux inode. consists of one or more MFT inodes. */ struct ntfs_inode { struct mft_inode mi; // base record /* * Valid size: [0 - i_valid) - these range in file contains valid data. * Range [i_valid - inode->i_size) - contains 0. * Usually i_valid <= inode->i_size. */ u64 i_valid; struct timespec64 i_crtime; struct mutex ni_lock; /* File attributes from std. */ enum FILE_ATTRIBUTE std_fa; __le32 std_security_id; /* * Tree of mft_inode. * Not empty when primary MFT record (usually 1024 bytes) can't save all attributes * e.g. file becomes too fragmented or contains a lot of names. */ struct rb_root mi_tree; /* * This member is used in ntfs_readdir to ensure that all subrecords are loaded */ u8 mi_loaded; union { struct ntfs_index dir; struct { struct rw_semaphore run_lock; struct runs_tree run; #ifdef CONFIG_NTFS3_LZX_XPRESS struct folio *offs_folio; #endif } file; }; struct { struct runs_tree run; struct ATTR_LIST_ENTRY *le; // 1K aligned memory. size_t size; bool dirty; } attr_list; size_t ni_flags; // NI_FLAG_XXX struct inode vfs_inode; }; struct indx_node { struct ntfs_buffers nb; struct INDEX_BUFFER *index; }; struct ntfs_fnd { int level; struct indx_node *nodes[20]; struct NTFS_DE *de[20]; struct NTFS_DE *root_de; }; enum REPARSE_SIGN { REPARSE_NONE = 0, REPARSE_COMPRESSED = 1, REPARSE_DEDUPLICATED = 2, REPARSE_LINK = 3 }; /* Functions from attrib.c */ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, CLST *new_lcn, CLST *new_len); int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, u64 new_size, struct runs_tree *run, struct ATTRIB **ins_attr, struct page *page); int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 new_size, const u64 *new_valid, bool keep_prealloc, struct ATTRIB **ret); int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, CLST *len, bool *new, bool zero); int attr_data_read_resident(struct ntfs_inode *ni, struct folio *folio); int attr_data_write_resident(struct ntfs_inode *ni, struct folio *folio); int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, CLST vcn); int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 from, u64 to); int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, struct runs_tree *run, u64 frame, u64 frames, u8 frame_bits, u32 *ondisk_size, u64 *vbo_data); int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, CLST frame, CLST *clst_data, struct runs_tree *run); int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, u64 new_valid); int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); int attr_force_nonresident(struct ntfs_inode *ni); /* Functions from attrlist.c */ void al_destroy(struct ntfs_inode *ni); bool al_verify(struct ntfs_inode *ni); int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr); struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, const struct ATTRIB *attr); struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const CLST *vcn); int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, struct ATTR_LIST_ENTRY **new_le); bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { return size_add(size, 1023) & ~(size_t)1023; } /* Globals from bitfunc.c */ bool are_bits_clear(const void *map, size_t bit, size_t nbits); bool are_bits_set(const void *map, size_t bit, size_t nbits); size_t get_set_bits_ex(const void *map, size_t bit, size_t nbits); /* Globals from dir.c */ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len); int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, struct cpu_str *uni, u32 max_ulen, enum utf16_endian endian); struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni, struct ntfs_fnd *fnd); bool dir_is_empty(struct inode *dir); extern const struct file_operations ntfs_dir_operations; extern const struct file_operations ntfs_legacy_dir_operations; /* Globals from file.c */ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg); long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg); extern const struct inode_operations ntfs_special_inode_operations; extern const struct inode_operations ntfs_file_inode_operations; extern const struct file_operations ntfs_file_operations; extern const struct file_operations ntfs_legacy_file_operations; /* Globals from frecord.c */ void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi); struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni); struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni); void ni_clear(struct ntfs_inode *ni); int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le, struct mft_inode **mi); struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **entry_o, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const CLST *vcn, struct mft_inode **mi); struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **le, struct mft_inode **mi); int ni_load_all_mi(struct ntfs_inode *ni); bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, bool base_only, const __le16 *id); int ni_create_attr_list(struct ntfs_inode *ni); int ni_expand_list(struct ntfs_inode *ni); int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const struct runs_tree *run, CLST svcn, CLST len, __le16 flags, struct ATTRIB **new_attr, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le); int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct ATTRIB **new_attr, struct mft_inode **mi, struct ATTR_LIST_ENTRY **le); void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, struct mft_inode *mi, struct ATTR_LIST_ENTRY *le); int ni_delete_all(struct ntfs_inode *ni); struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, const struct le_str *uni, const struct MFT_REF *home, struct mft_inode **mi, struct ATTR_LIST_ENTRY **entry); struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, struct mft_inode **mi, struct ATTR_LIST_ENTRY **entry); int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, struct REPARSE_DATA_BUFFER *buffer); int ni_write_inode(struct inode *inode, int sync, const char *hint); #define _ni_write_inode(i, w) ni_write_inode(i, w, __func__) int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, __u64 vbo, __u64 len); int ni_readpage_cmpr(struct ntfs_inode *ni, struct folio *folio); int ni_decompress_file(struct ntfs_inode *ni); int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, u32 pages_per_frame); int ni_write_frame(struct ntfs_inode *ni, struct page **pages, u32 pages_per_frame); int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step); bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *de2, int undo_step); int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de); int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, bool *is_bad); bool ni_is_dirty(struct inode *inode); /* Globals from fslog.c */ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block); bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple); int ntfs_extend_init(struct ntfs_sb_info *sbi); int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi); int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, CLST *new_lcn, CLST *new_len, enum ALLOCATE_OPT opt); bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen); int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, struct ntfs_inode *ni, struct mft_inode **mi); void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft); int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to); int ntfs_refresh_zone(struct ntfs_sb_info *sbi); void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait); void ntfs_bad_inode(struct inode *inode, const char *hint); #define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__) enum NTFS_DIRTY_FLAGS { NTFS_DIRTY_CLEAR = 0, NTFS_DIRTY_DIRTY = 1, NTFS_DIRTY_ERROR = 2, }; int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, const void *buf, size_t bytes, int sync); struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo); int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb); int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, struct NTFS_RECORD_HEADER *rhdr, u32 bytes, struct ntfs_buffers *nb); int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u32 bytes, struct ntfs_buffers *nb); int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, struct ntfs_buffers *nb, int sync); int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, enum req_op op); int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run); int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u64 *lbo, u64 *bytes); struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST nRec, enum RECORD_FLAG flag); extern const u8 s_default_security[0x50]; bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len); int ntfs_security_init(struct ntfs_sb_info *sbi); int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, struct SECURITY_DESCRIPTOR_RELATIVE **sd, size_t *size); int ntfs_insert_security(struct ntfs_sb_info *sbi, const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 size, __le32 *security_id, bool *inserted); int ntfs_reparse_init(struct ntfs_sb_info *sbi); int ntfs_objid_init(struct ntfs_sb_info *sbi); int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid); int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref); int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref); void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim); int run_deallocate(struct ntfs_sb_info *sbi, const struct runs_tree *run, bool trim); bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *name); int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len); /* Globals from index.c */ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit); void fnd_clear(struct ntfs_fnd *fnd); static inline struct ntfs_fnd *fnd_get(void) { return kzalloc(sizeof(struct ntfs_fnd), GFP_NOFS); } static inline void fnd_put(struct ntfs_fnd *fnd) { if (fnd) { fnd_clear(fnd); kfree(fnd); } } void indx_clear(struct ntfs_index *idx); int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, const struct ATTRIB *attr, enum index_mutex_classed type); struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, struct ATTRIB **attr, struct mft_inode **mi); int indx_read(struct ntfs_index *idx, struct ntfs_inode *ni, CLST vbn, struct indx_node **node); int indx_find(struct ntfs_index *indx, struct ntfs_inode *dir, const struct INDEX_ROOT *root, const void *Key, size_t KeyLen, const void *param, int *diff, struct NTFS_DE **entry, struct ntfs_fnd *fnd); int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, struct ntfs_fnd *fnd); int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, size_t *off, struct ntfs_fnd *fnd); int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *new_de, const void *param, struct ntfs_fnd *fnd, bool undo); int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const void *key, u32 key_len, const void *param); int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, const struct ATTR_FILE_NAME *fname, const struct NTFS_DUP_INFO *dup, int sync); /* Globals from inode.c */ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, const struct cpu_str *name); int ntfs_set_size(struct inode *inode, u64 new_size); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, u32 len, struct folio **foliop, void **fsdata); int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int inode_read_data(struct inode *inode, void *data, size_t bytes); int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, struct ntfs_fnd *fnd); int ntfs_link_inode(struct inode *inode, struct dentry *dentry); int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry); void ntfs_evict_inode(struct inode *inode); extern const struct inode_operations ntfs_link_inode_operations; extern const struct address_space_operations ntfs_aops; extern const struct address_space_operations ntfs_aops_cmpr; /* Globals from name_i.c */ int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name, const struct cpu_str *uni); struct dentry *ntfs3_get_parent(struct dentry *child); extern const struct inode_operations ntfs_dir_inode_operations; extern const struct inode_operations ntfs_special_inode_operations; extern const struct dentry_operations ntfs_dentry_ops; /* Globals from record.c */ int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); void mi_put(struct mft_inode *mi); int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno); int mi_read(struct mft_inode *mi, bool is_mft); struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr); struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const __le16 *id); static inline struct ATTRIB *rec_find_attr_le(struct ntfs_inode *ni, struct mft_inode *rec, struct ATTR_LIST_ENTRY *le) { return mi_find_attr(ni, rec, NULL, le->type, le_name(le), le->name_len, &le->id); } int mi_write(struct mft_inode *mi, int wait); int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, __le16 flags, bool is_mft); struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi, enum ATTR_TYPE type, const __le16 *name, u8 name_len, u32 asize, u16 name_off); bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, struct ATTRIB *attr); bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes); int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, struct runs_tree *run, CLST len); static inline bool mi_is_ref(const struct mft_inode *mi, const struct MFT_REF *ref) { if (le32_to_cpu(ref->low) != mi->rno) return false; if (ref->seq != mi->mrec->seq) return false; #ifdef CONFIG_NTFS3_64BIT_CLUSTER return le16_to_cpu(ref->high) == (mi->rno >> 32); #else return !ref->high; #endif } static inline void mi_get_ref(const struct mft_inode *mi, struct MFT_REF *ref) { ref->low = cpu_to_le32(mi->rno); #ifdef CONFIG_NTFS3_64BIT_CLUSTER ref->high = cpu_to_le16(mi->rno >> 32); #else ref->high = 0; #endif ref->seq = mi->mrec->seq; } /* Globals from run.c */ bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn, CLST *len, size_t *index); void run_truncate(struct runs_tree *run, CLST vcn); void run_truncate_head(struct runs_tree *run, CLST vcn); void run_truncate_around(struct runs_tree *run, CLST vcn); bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len, bool is_mft); bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len); bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len); bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn, CLST *lcn, CLST *len); bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn); int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, u32 run_buf_size, CLST *packed_vcns); int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, int run_buf_size); #ifdef NTFS3_CHECK_FREE_CLST int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, int run_buf_size); #else #define run_unpack_ex run_unpack #endif int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn); int run_clone(const struct runs_tree *run, struct runs_tree *new_run); /* Globals from super.c */ void *ntfs_set_shared(void *ptr, u32 bytes); void *ntfs_put_shared(void *ptr); void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len); int ntfs_discard(struct ntfs_sb_info *sbi, CLST Lcn, CLST Len); /* Globals from bitmap.c*/ int __init ntfs3_init_bitmap(void); void ntfs3_exit_bitmap(void); void wnd_close(struct wnd_bitmap *wnd); static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd) { return wnd->total_zeroes; } int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits); int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits, size_t *done); bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); /* Possible values for 'flags' 'wnd_find'. */ #define BITMAP_FIND_MARK_AS_USED 0x01 #define BITMAP_FIND_FULL 0x02 size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, size_t flags, size_t *allocated); int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits); void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len); int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range); void ntfs_bitmap_set_le(void *map, unsigned int start, int len); void ntfs_bitmap_clear_le(void *map, unsigned int start, int len); unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits); /* Globals from upcase.c */ int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2, const u16 *upcase, bool bothcase); int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, const u16 *upcase, bool bothcase); unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, unsigned long hash); /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir); #else #define ntfs_get_acl NULL #define ntfs_set_acl NULL #endif int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); /* globals from lznt.c */ struct lznt *get_lznt_ctx(int level); size_t compress_lznt(const void *uncompressed, size_t uncompressed_size, void *compressed, size_t compressed_size, struct lznt *ctx); ssize_t decompress_lznt(const void *compressed, size_t compressed_size, void *uncompressed, size_t uncompressed_size); static inline bool is_ntfs3(struct ntfs_sb_info *sbi) { return sbi->volume.major_ver >= 3; } /* (sb->s_flags & SB_ACTIVE) */ static inline bool is_mounted(struct ntfs_sb_info *sbi) { return !!sbi->sb->s_root; } static inline bool ntfs_is_meta_file(struct ntfs_sb_info *sbi, CLST rno) { return rno < MFT_REC_FREE || rno == sbi->objid_no || rno == sbi->quota_no || rno == sbi->reparse_no || rno == sbi->usn_jrnl_no; } static inline size_t wnd_zone_bit(const struct wnd_bitmap *wnd) { return wnd->zone_bit; } static inline size_t wnd_zone_len(const struct wnd_bitmap *wnd) { return wnd->zone_end - wnd->zone_bit; } static inline void run_init(struct runs_tree *run) { run->runs = NULL; run->count = 0; run->allocated = 0; } static inline struct runs_tree *run_alloc(void) { return kzalloc(sizeof(struct runs_tree), GFP_NOFS); } static inline void run_close(struct runs_tree *run) { kvfree(run->runs); memset(run, 0, sizeof(*run)); } static inline void run_free(struct runs_tree *run) { if (run) { kvfree(run->runs); kfree(run); } } static inline bool run_is_empty(struct runs_tree *run) { return !run->count; } /* NTFS uses quad aligned bitmaps. */ static inline size_t ntfs3_bitmap_size(size_t bits) { return BITS_TO_U64(bits) * sizeof(u64); } #define _100ns2seconds 10000000 #define SecondsToStartOf1970 0x00000002B6109100 #define NTFS_TIME_GRAN 100 /* * kernel2nt - Converts in-memory kernel timestamp into nt time. */ static inline __le64 kernel2nt(const struct timespec64 *ts) { // 10^7 units of 100 nanoseconds one second return cpu_to_le64(_100ns2seconds * (ts->tv_sec + SecondsToStartOf1970) + ts->tv_nsec / NTFS_TIME_GRAN); } /* * nt2kernel - Converts on-disk nt time into kernel timestamp. */ static inline void nt2kernel(const __le64 tm, struct timespec64 *ts) { u64 t = le64_to_cpu(tm) - _100ns2seconds * SecondsToStartOf1970; // WARNING: do_div changes its first argument(!) ts->tv_nsec = do_div(t, _100ns2seconds) * 100; ts->tv_sec = t; } static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) { return sb->s_fs_info; } static inline int ntfs3_forced_shutdown(struct super_block *sb) { return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); } /* * ntfs_up_cluster - Align up on cluster boundary. */ static inline u64 ntfs_up_cluster(const struct ntfs_sb_info *sbi, u64 size) { return (size + sbi->cluster_mask) & sbi->cluster_mask_inv; } /* * ntfs_up_block - Align up on cluster boundary. */ static inline u64 ntfs_up_block(const struct super_block *sb, u64 size) { return (size + sb->s_blocksize - 1) & ~(u64)(sb->s_blocksize - 1); } static inline CLST bytes_to_cluster(const struct ntfs_sb_info *sbi, u64 size) { return (size + sbi->cluster_mask) >> sbi->cluster_bits; } static inline u64 bytes_to_block(const struct super_block *sb, u64 size) { return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } static inline struct ntfs_inode *ntfs_i(struct inode *inode) { return container_of(inode, struct ntfs_inode, vfs_inode); } static inline bool is_compressed(const struct ntfs_inode *ni) { return (ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) || (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); } static inline int ni_ext_compress_bits(const struct ntfs_inode *ni) { return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); } /* Bits - 0xc, 0xd, 0xe, 0xf, 0x10 */ static inline void ni_set_ext_compress_bits(struct ntfs_inode *ni, u8 bits) { ni->ni_flags |= (bits - 0xb) & NI_FLAG_COMPRESSED_MASK; } static inline bool is_dedup(const struct ntfs_inode *ni) { return ni->ni_flags & NI_FLAG_DEDUPLICATED; } static inline bool is_encrypted(const struct ntfs_inode *ni) { return ni->std_fa & FILE_ATTRIBUTE_ENCRYPTED; } static inline bool is_sparsed(const struct ntfs_inode *ni) { return ni->std_fa & FILE_ATTRIBUTE_SPARSE_FILE; } static inline int is_resident(struct ntfs_inode *ni) { return ni->ni_flags & NI_FLAG_RESIDENT; } static inline void le16_sub_cpu(__le16 *var, u16 val) { *var = cpu_to_le16(le16_to_cpu(*var) - val); } static inline void le32_sub_cpu(__le32 *var, u32 val) { *var = cpu_to_le32(le32_to_cpu(*var) - val); } static inline void nb_put(struct ntfs_buffers *nb) { u32 i, nbufs = nb->nbufs; if (!nbufs) return; for (i = 0; i < nbufs; i++) put_bh(nb->bh[i]); nb->nbufs = 0; } static inline void put_indx_node(struct indx_node *in) { if (!in) return; kfree(in->index); nb_put(&in->nb); kfree(in); } static inline void mi_clear(struct mft_inode *mi) { nb_put(&mi->nb); kfree(mi->mrec); mi->mrec = NULL; } static inline void ni_lock(struct ntfs_inode *ni) { mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_NORMAL); } static inline void ni_lock_dir(struct ntfs_inode *ni) { mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT); } static inline void ni_lock_dir2(struct ntfs_inode *ni) { mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT2); } static inline void ni_unlock(struct ntfs_inode *ni) { mutex_unlock(&ni->ni_lock); } static inline int ni_trylock(struct ntfs_inode *ni) { return mutex_trylock(&ni->ni_lock); } static inline int attr_load_runs_attr(struct ntfs_inode *ni, struct ATTRIB *attr, struct runs_tree *run, CLST vcn) { return attr_load_runs_vcn(ni, attr->type, attr_name(attr), attr->name_len, run, vcn); } static inline void le64_sub_cpu(__le64 *var, u64 val) { *var = cpu_to_le64(le64_to_cpu(*var) - val); } #if IS_ENABLED(CONFIG_NTFS_FS) bool is_legacy_ntfs(struct super_block *sb); #else static inline bool is_legacy_ntfs(struct super_block *sb) { return false; } #endif #endif /* _LINUX_NTFS3_NTFS_FS_H */
469 469 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 // SPDX-License-Identifier: GPL-2.0 #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H #include <linux/fs.h> #include <linux/atomic.h> #include <linux/sched.h> #include <linux/clocksource.h> #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/trace.h> #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/trace_events.h> #include <linux/compiler.h> #include <linux/glob.h> #include <linux/irq_work.h> #include <linux/workqueue.h> #include <linux/ctype.h> #include <linux/once_lite.h> #include <linux/ftrace_regs.h> #include "pid_list.h" #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_syscalls */ #include <asm/syscall.h> /* some archs define it here */ #endif #define TRACE_MODE_WRITE 0640 #define TRACE_MODE_READ 0440 enum trace_type { __TRACE_FIRST_TYPE = 0, TRACE_FN, TRACE_CTX, TRACE_WAKE, TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_GRAPH_RETADDR_ENT, TRACE_USER_STACK, TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, TRACE_OSNOISE, TRACE_TIMERLAT, TRACE_RAW_DATA, TRACE_FUNC_REPEATS, __TRACE_LAST_TYPE, }; #undef __field #define __field(type, item) type item; #undef __field_fn #define __field_fn(type, item) type item; #undef __field_struct #define __field_struct(type, item) __field(type, item) #undef __field_desc #define __field_desc(type, container, item) #undef __field_packed #define __field_packed(type, container, item) #undef __array #define __array(type, item, size) type item[size]; /* * For backward compatibility, older user space expects to see the * kernel_stack event with a fixed size caller field. But today the fix * size is ignored by the kernel, and the real structure is dynamic. * Expose to user space: "unsigned long caller[8];" but the real structure * will be "unsigned long caller[] __counted_by(size)" */ #undef __stack_array #define __stack_array(type, item, size, field) type item[] __counted_by(field); #undef __array_desc #define __array_desc(type, container, item, size) #undef __dynamic_array #define __dynamic_array(type, item) type item[]; #undef __rel_dynamic_array #define __rel_dynamic_array(type, item) type item[]; #undef F_STRUCT #define F_STRUCT(args...) args #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct struct_name { \ struct trace_entry ent; \ tstruct \ } #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED #define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed #include "trace_entries.h" /* Use this for memory failure errors */ #define MEM_FAIL(condition, fmt, ...) \ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) #define FAULT_STRING "(fault)" #define HIST_STACKTRACE_DEPTH 16 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h */ struct syscall_trace_enter { struct trace_entry ent; int nr; unsigned long args[]; }; struct syscall_trace_exit { struct trace_entry ent; int nr; long ret; }; struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; }; struct eprobe_trace_entry_head { struct trace_entry ent; }; struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; }; struct fentry_trace_entry_head { struct trace_entry ent; unsigned long ip; }; struct fexit_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; }; #define TRACE_BUF_SIZE 1024 struct trace_array; /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started * the trace, etc.) */ struct trace_array_cpu { local_t disabled; unsigned long entries; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; unsigned long critical_sequence; unsigned long nice; unsigned long policy; unsigned long rt_priority; unsigned long skipped_entries; u64 preempt_timestamp; pid_t pid; kuid_t uid; char comm[TASK_COMM_LEN]; #ifdef CONFIG_FUNCTION_TRACER int ftrace_ignore_pid; #endif bool ignore_pid; }; struct tracer; struct trace_option_dentry; struct array_buffer { struct trace_array *tr; struct trace_buffer *buffer; struct trace_array_cpu __percpu *data; u64 time_start; int cpu; }; #define TRACE_FLAGS_MAX_SIZE 32 struct trace_options { struct tracer *tracer; struct trace_option_dentry *topts; }; struct trace_pid_list *trace_pid_list_alloc(void); void trace_pid_list_free(struct trace_pid_list *pid_list); bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid); int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid); int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid); int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid); int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, unsigned int *next); enum { TRACE_PIDS = BIT(0), TRACE_NO_PIDS = BIT(1), }; static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list, struct trace_pid_list *no_pid_list) { /* Return true if the pid list in type has pids */ return ((type & TRACE_PIDS) && pid_list) || ((type & TRACE_NO_PIDS) && no_pid_list); } static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list, struct trace_pid_list *no_pid_list) { /* * Turning off what is in @type, return true if the "other" * pid list, still has pids in it. */ return (!(type & TRACE_PIDS) && pid_list) || (!(type & TRACE_NO_PIDS) && no_pid_list); } typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); /** * struct cond_snapshot - conditional snapshot data and callback * * The cond_snapshot structure encapsulates a callback function and * data associated with the snapshot for a given tracing instance. * * When a snapshot is taken conditionally, by invoking * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is * passed in turn to the cond_snapshot.update() function. That data * can be compared by the update() implementation with the cond_data * contained within the struct cond_snapshot instance associated with * the trace_array. Because the tr->max_lock is held throughout the * update() call, the update() function can directly retrieve the * cond_snapshot and cond_data associated with the per-instance * snapshot associated with the trace_array. * * The cond_snapshot.update() implementation can save data to be * associated with the snapshot if it decides to, and returns 'true' * in that case, or it returns 'false' if the conditional snapshot * shouldn't be taken. * * The cond_snapshot instance is created and associated with the * user-defined cond_data by tracing_cond_snapshot_enable(). * Likewise, the cond_snapshot instance is destroyed and is no longer * associated with the trace instance by * tracing_cond_snapshot_disable(). * * The method below is required. * * @update: When a conditional snapshot is invoked, the update() * callback function is invoked with the tr->max_lock held. The * update() implementation signals whether or not to actually * take the snapshot, by returning 'true' if so, 'false' if no * snapshot should be taken. Because the max_lock is held for * the duration of update(), the implementation is safe to * directly retrieved and save any implementation data it needs * to in association with the snapshot. */ struct cond_snapshot { void *cond_data; cond_update_fn_t update; }; /* * struct trace_func_repeats - used to keep track of the consecutive * (on the same CPU) calls of a single function. */ struct trace_func_repeats { unsigned long ip; unsigned long parent_ip; unsigned long count; u64 ts_last_call; }; struct trace_module_delta { struct rcu_head rcu; long delta[]; }; /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: */ struct trace_array { struct list_head list; char *name; struct array_buffer array_buffer; #ifdef CONFIG_TRACER_MAX_TRACE /* * The max_buffer is used to snapshot the trace when a maximum * latency is reached, or when the user initiates a snapshot. * Some tracers will use this to store a maximum trace while * it continues examining live traces. * * The buffers for the max_buffer are set up the same as the array_buffer * When a snapshot is taken, the buffer of the max_buffer is swapped * with the buffer of the array_buffer and the buffers are reset for * the array_buffer so the tracing can continue. */ struct array_buffer max_buffer; bool allocated_snapshot; spinlock_t snapshot_trigger_lock; unsigned int snapshot; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; struct work_struct fsnotify_work; struct irq_work fsnotify_irqwork; #endif #endif /* The below is for memory mapped ring buffer */ unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; char *range_name; long text_delta; struct trace_module_delta *module_delta; void *scratch; /* pointer in persistent memory */ int scratch_size; int buffer_disabled; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; /* * max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are * protected by per_cpu spinlocks. But the action of the swap * needs its own lock. * * This is defined as a arch_spinlock_t in order to help * with performance when lockdep debugging is enabled. * * It is also used in other places outside the update_max_tr * so it needs to be defined outside of the * CONFIG_TRACER_MAX_TRACE. */ arch_spinlock_t max_lock; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; int nr_topts; bool clear_trace; int buffer_percent; unsigned int n_err_log_entries; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; const char *system_names; struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; struct eventfs_inode *event_dir; struct trace_options *topts; struct list_head systems; struct list_head events; struct list_head marker_list; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ cpumask_var_t pipe_cpumask; int ref; int trace_ref; #ifdef CONFIG_MODULES struct list_head mod_events; #endif #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; struct trace_pid_list __rcu *function_no_pids; #ifdef CONFIG_FUNCTION_GRAPH_TRACER struct fgraph_ops *gops; #endif #ifdef CONFIG_DYNAMIC_FTRACE /* All of these are protected by the ftrace_lock */ struct list_head func_probes; struct list_head mod_trace; struct list_head mod_notrace; #endif /* function tracing enabled */ int function_enabled; #endif int no_filter_buffering_ref; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; #endif struct trace_func_repeats __percpu *last_func_repeats; /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ bool ring_buffer_expanded; }; enum { TRACE_ARRAY_FL_GLOBAL = BIT(0), TRACE_ARRAY_FL_BOOT = BIT(1), TRACE_ARRAY_FL_LAST_BOOT = BIT(2), TRACE_ARRAY_FL_MOD_INIT = BIT(3), TRACE_ARRAY_FL_MEMMAP = BIT(4), }; #ifdef CONFIG_MODULES bool module_exists(const char *module); #else static inline bool module_exists(const char *module) { return false; } #endif extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. */ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; if (list_empty(&ftrace_trace_arrays)) return NULL; tr = list_entry(ftrace_trace_arrays.prev, typeof(*tr), list); WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); return tr; } #define FTRACE_CMP_TYPE(var, type) \ __builtin_types_compatible_p(typeof(var), type *) #undef IF_ASSIGN #define IF_ASSIGN(var, entry, etype, id) \ if (FTRACE_CMP_TYPE(var, etype)) { \ var = (typeof(var))(entry); \ WARN_ON(id != 0 && (entry)->type != id); \ break; \ } /* Will cause compile errors if type is not found. */ extern void __ftrace_bad_type(void); /* * The trace_assign_type is a verifier that the entry type is * the same as the type being assigned. To add new types simply * add a line with the following format: * * IF_ASSIGN(var, ent, type, id); * * Where "type" is the trace type that includes the trace_entry * as the "ent" item. And "id" is the trace identifier that is * used in the trace_type enum. * * If the type can have more than one id, then use zero. */ #define trace_assign_type(var, ent) \ do { \ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\ IF_ASSIGN(var, ent, struct timerlat_entry, TRACE_TIMERLAT);\ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\ TRACE_GRAPH_RETADDR_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct func_repeats_entry, \ TRACE_FUNC_REPEATS); \ __ftrace_bad_type(); \ } while (0) /* * An option specific to a tracer. This is a boolean value. * The bit is the bit index that sets its value on the * flags value in struct tracer_flags. */ struct tracer_opt { const char *name; /* Will appear on the trace_options file */ u32 bit; /* Mask assigned in val field in tracer_flags */ }; /* * The set of specific options for a tracer. Your tracer * have to set the initial value of the flags val. */ struct tracer_flags { u32 val; struct tracer_opt *opts; struct tracer *trace; }; /* Makes more easy to define a tracer opt */ #define TRACER_OPT(s, b) .name = #s, .bit = b struct trace_option_dentry { struct tracer_opt *opt; struct tracer_flags *flags; struct trace_array *tr; struct dentry *entry; }; /** * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer * @start: called when tracing is unpaused (echo 1 > tracing_on) * @stop: called when tracing is paused (echo 0 > tracing_on) * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened * @close: called when the trace file is released * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe * @splice_read: override the default splice_read callback on trace_pipe * @selftest: selftest to run on boot (see trace_selftest.c) * @print_headers: override the first lines that describe your columns * @print_line: callback that prints a trace * @set_flag: signals one of your private flags changed (trace_options file) * @flags: your private flags */ struct tracer { const char *name; int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); int (*update_thresh)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t (*splice_read)(struct trace_iterator *iter, struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(struct trace_array *tr, u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct trace_array *tr, u32 mask, int set); struct tracer *next; struct tracer_flags *flags; int enabled; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif /* True if tracer cannot be enabled in kernel param */ bool noboot; }; static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; } int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); int tracing_single_release_file_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); void tracer_tracing_disable(struct trace_array *tr); void tracer_tracing_enable(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); /** * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu * @tr : the trace array to know if ring buffer is enabled * @cpu: The cpu buffer to check if enabled * * Shows real state of the per CPU buffer if it is enabled or not. */ static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu) { if (tr->array_buffer.buffer) return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu); return false; } int tracing_init_dentry(void); struct ring_buffer_event; struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned int trace_ctx); int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); char *trace_iter_expand_format(struct trace_iterator *iter); bool ignore_event(struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); void *trace_find_next_entry_inc(struct trace_iterator *iter); void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx, struct ftrace_regs *regs); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx); void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs); int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); void tracing_start_tgid_record(void); void tracing_stop_tgid_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); loff_t tracing_lseek(struct file *file, loff_t offset, int whence); extern cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; /* PID filtering */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct trace_pid_list *filtered_no_pids, struct task_struct *task); void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task); void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); int trace_pid_show(struct seq_file *m, void *v); int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #ifdef CONFIG_FSNOTIFY #define LATENCY_FS_NOTIFY #endif #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); #else static inline void latency_fsnotify(struct trace_array *tr) { } #endif #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); #else static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip) { } #endif /* CONFIG_STACKTRACE */ void trace_last_func_repeats(struct trace_array *tr, struct trace_func_repeats *last_info, unsigned int trace_ctx); extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); extern int trace_events_enabled(struct trace_array *tr, const char *system); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; extern unsigned long ftrace_number_of_groups; extern u64 ftrace_update_time; extern u64 ftrace_total_mod_time; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } #endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); extern void trace_set_ring_buffer_expanded(struct trace_array *tr); extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST extern void __init disable_tracing_selftest(const char *reason); extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_function_graph(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); /* * Tracer data references selftest functions that only occur * on boot up. These can be __init functions. Thus, when selftests * are enabled, then the tracers need to reference __init functions. */ #define __tracer_data __refdata #else static inline void __init disable_tracing_selftest(const char *reason) { } /* Tracers are seldom changed. Optimize when selftests are disabled. */ #define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); __printf(2, 0) int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); __printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args); __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); __printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); struct ftrace_hash; struct ftrace_mod_load { struct list_head list; char *func; char *module; int enable; }; enum { FTRACE_HASH_FL_MOD = (1 << 0), }; struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; unsigned long count; unsigned long flags; struct rcu_head rcu; }; struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); } /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Flag options */ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_REL_TIME 0x40 #define TRACE_GRAPH_PRINT_IRQS 0x80 #define TRACE_GRAPH_PRINT_TAIL 0x100 #define TRACE_GRAPH_SLEEP_TIME 0x200 #define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_RETADDR 0x2000 #define TRACE_GRAPH_ARGS 0x4000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern void ftrace_graph_sleep_time_control(bool enable); #ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); #else static inline void ftrace_graph_graph_time_control(bool enable) { } #endif extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); extern void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); extern void graph_trace_open(struct trace_iterator *iter); extern void graph_trace_close(struct trace_iterator *iter); extern int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx); extern int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, unsigned long retaddr); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned int trace_ctx, u64 calltime, u64 rettime); extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern void free_fgraph_ops(struct trace_array *tr); enum { TRACE_GRAPH_FL = 1, /* * In the very unlikely case that an interrupt came in * at a start of graph tracing, and we want to trace * the function in that interrupt, the depth can be greater * than zero, because of the preempted start of a previous * trace. In an even more unlikely case, depth could be 2 * if a softirq interrupted the start of graph tracing, * followed by an interrupt preempting a start of graph * tracing in the softirq, and depth can even be 3 * if an NMI came in at the start of an interrupt function * that preempted a softirq start of a function that * preempted normal context!!!! Luckily, it can't be * greater than 3, so the next two bits are a mask * of what the depth is when we set TRACE_GRAPH_FL */ TRACE_GRAPH_DEPTH_START_BIT, TRACE_GRAPH_DEPTH_END_BIT, /* * To implement set_graph_notrace, if this bit is set, we ignore * function graph tracing of called functions, until the return * function is called to clear it. */ TRACE_GRAPH_NOTRACE_BIT, }; #define TRACE_GRAPH_NOTRACE (1 << TRACE_GRAPH_NOTRACE_BIT) static inline unsigned long ftrace_graph_depth(unsigned long *task_var) { return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3; } static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth) { *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT); *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT; } #ifdef CONFIG_DYNAMIC_FTRACE extern struct ftrace_hash __rcu *ftrace_graph_hash; extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace) { unsigned long addr = trace->func; int ret = 0; struct ftrace_hash *hash; preempt_disable_notrace(); /* * Have to open code "rcu_dereference_sched()" because the * function graph tracer can be called when RCU is not * "watching". * Protected with schedule_on_each_cpu(ftrace_sync) */ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible()); if (ftrace_hash_empty(hash)) { ret = 1; goto out; } if (ftrace_lookup_ip(hash, addr)) { /* * This needs to be cleared on the return functions * when the depth is zero. */ *task_var |= TRACE_GRAPH_FL; ftrace_graph_set_depth(task_var, trace->depth); /* * If no irqs are to be traced, but a set_graph_function * is set, and called by an interrupt handler, we still * want to trace it. */ if (in_hardirq()) trace_recursion_set(TRACE_IRQ_BIT); else trace_recursion_clear(TRACE_IRQ_BIT); ret = 1; } out: preempt_enable_notrace(); return ret; } static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace) { unsigned long *task_var = fgraph_get_task_var(gops); if ((*task_var & TRACE_GRAPH_FL) && trace->depth == ftrace_graph_depth(task_var)) *task_var &= ~TRACE_GRAPH_FL; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { int ret = 0; struct ftrace_hash *notrace_hash; preempt_disable_notrace(); /* * Have to open code "rcu_dereference_sched()" because the * function graph tracer can be called when RCU is not * "watching". * Protected with schedule_on_each_cpu(ftrace_sync) */ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, !preemptible()); if (ftrace_lookup_ip(notrace_hash, addr)) ret = 1; preempt_enable_notrace(); return ret; } #else static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace) { return 1; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { return 0; } static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace) { } #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; extern bool fgraph_sleep_time; static inline bool ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace) { unsigned long *task_var = fgraph_get_task_var(gops); /* trace it when it is-nested-in or is a function enabled. */ return !((*task_var & TRACE_GRAPH_FL) || ftrace_graph_addr(task_var, trace)) || (trace->depth < 0) || (fgraph_max_depth && trace->depth >= fgraph_max_depth); } void fgraph_init_ops(struct ftrace_ops *dst_ops, struct ftrace_ops *src_ops); #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags) { return TRACE_TYPE_UNHANDLED; } static inline void free_fgraph_ops(struct trace_array *tr) { } /* ftrace_ops may not be defined */ #define init_array_fgraph_ops(tr, ops) do { } while (0) #define allocate_fgraph_ops(tr, ops) ({ 0; }) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER #define FTRACE_PID_IGNORE -1 #define FTRACE_PID_TRACE -2 struct ftrace_func_command { struct list_head list; char *name; int (*func)(struct trace_array *tr, struct ftrace_hash *hash, char *func, char *cmd, char *params, int enable); }; extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != FTRACE_PID_IGNORE; } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); int ftrace_allocate_ftrace_ops(struct trace_array *tr); void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); struct trace_array *trace_get_global_array(void); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); void ftrace_clear_pids(struct trace_array *tr); int init_function_trace(void); void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); #else static inline int ftrace_trace_task(struct trace_array *tr) { return 1; } static inline int ftrace_is_dead(void) { return 0; } static inline int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent) { return 0; } static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr) { return 0; } static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_clear_pids(struct trace_array *tr) { } static inline int init_function_trace(void) { return 0; } static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data); int (*init)(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data); void (*free)(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data); int (*print)(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data); }; struct ftrace_func_mapper; typedef int (*ftrace_mapper_func)(void *data); struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, unsigned long ip); int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data); void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, unsigned long ip); void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, ftrace_mapper_func free_func); extern int register_ftrace_function_probe(char *glob, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data); extern int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *ops); extern void clear_ftrace_function_probes(struct trace_array *tr); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); #else struct ftrace_func_command; static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) { return -EINVAL; } static inline __init int unregister_ftrace_command(char *cmd_name) { return -EINVAL; } static inline void clear_ftrace_function_probes(struct trace_array *tr) { } /* * The ops parameter passed in is usually undefined. * This must be a macro. */ #define ftrace_create_filter_files(ops, parent) do { } while (0) #define ftrace_destroy_filter_files(ops) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ bool ftrace_event_is_function(struct trace_event_call *call); /* * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found * @buffer: holds the parsed user input * @idx: user input length * @size: buffer size */ struct trace_parser { bool cont; char *buffer; unsigned idx; unsigned size; }; static inline bool trace_parser_loaded(struct trace_parser *parser) { return (parser->idx != 0); } static inline bool trace_parser_cont(struct trace_parser *parser) { return parser->cont; } static inline void trace_parser_clear(struct trace_parser *parser) { parser->cont = false; parser->idx = 0; } extern int trace_parser_get_init(struct trace_parser *parser, int size); extern void trace_parser_put(struct trace_parser *parser); extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos); /* * Only create function graph options if function graph is configured. */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER # define FGRAPH_FLAGS \ C(DISPLAY_GRAPH, "display-graph"), #else # define FGRAPH_FLAGS #endif #ifdef CONFIG_BRANCH_TRACER # define BRANCH_FLAGS \ C(BRANCH, "branch"), #else # define BRANCH_FLAGS #endif #ifdef CONFIG_FUNCTION_TRACER # define FUNCTION_FLAGS \ C(FUNCTION, "function-trace"), \ C(FUNC_FORK, "function-fork"), # define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL # define TRACE_ITER_FUNC_FORK 0UL #endif #ifdef CONFIG_STACKTRACE # define STACK_FLAGS \ C(STACKTRACE, "stacktrace"), #else # define STACK_FLAGS #endif /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * * NOTE: These bits must match the trace_options array in * trace.c (this macro guarantees it). */ #define TRACE_FLAGS \ C(PRINT_PARENT, "print-parent"), \ C(SYM_OFFSET, "sym-offset"), \ C(SYM_ADDR, "sym-addr"), \ C(VERBOSE, "verbose"), \ C(RAW, "raw"), \ C(HEX, "hex"), \ C(BIN, "bin"), \ C(BLOCK, "block"), \ C(FIELDS, "fields"), \ C(PRINTK, "trace_printk"), \ C(ANNOTATE, "annotate"), \ C(USERSTACKTRACE, "userstacktrace"), \ C(SYM_USEROBJ, "sym-userobj"), \ C(PRINTK_MSGONLY, "printk-msg-only"), \ C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ C(LATENCY_FMT, "latency-format"), \ C(RECORD_CMD, "record-cmd"), \ C(RECORD_TGID, "record-tgid"), \ C(OVERWRITE, "overwrite"), \ C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ C(COPY_MARKER, "copy_trace_marker"),\ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ BRANCH_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names * that will define the bits for the flag masks. */ #undef C #define C(a, b) TRACE_ITER_##a##_BIT enum trace_iterator_bits { TRACE_FLAGS /* Make sure we don't go more than we have bits for */ TRACE_ITER_LAST_BIT }; /* * By redefining C, we can make TRACE_FLAGS a list of masks that * use the bits as defined above. */ #undef C #define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) enum trace_iterator_flags { TRACE_FLAGS }; /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. */ #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) extern struct tracer nop_trace; #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { if (tr->trace_flags & TRACE_ITER_BRANCH) return enable_branch_tracing(tr); return 0; } static inline void trace_branch_disable(void) { /* due to races, always disable */ disable_branch_tracing(); } #else static inline int trace_branch_enable(struct trace_array *tr) { return 0; } static inline void trace_branch_disable(void) { } #endif /* CONFIG_BRANCH_TRACER */ /* set ring buffers to default size if not already done so */ int tracing_update_buffers(struct trace_array *tr); union trace_synth_field { u8 as_u8; u16 as_u16; u32 as_u32; u64 as_u64; struct trace_dynamic_info as_dynamic; }; struct ftrace_event_field { struct list_head link; const char *name; const char *type; int filter_type; int offset; int size; unsigned int is_signed:1; unsigned int needs_test:1; int len; }; struct prog_entry; struct event_filter { struct prog_entry __rcu *prog; char *filter_string; }; struct event_subsystem { struct list_head list; const char *name; struct event_filter *filter; int ref_count; }; struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; struct eventfs_inode *ei; int ref_count; int nr_events; }; void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned int trcace_ctx, struct pt_regs *regs); static inline void trace_buffer_unlock_commit(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned int trace_ctx) { trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); } DECLARE_PER_CPU(bool, trace_taskinfo_save); int trace_save_cmdline(struct task_struct *tsk); int trace_create_savedcmd(void); int trace_alloc_tgid_map(void); void trace_free_saved_cmdlines_buffer(void); extern const struct file_operations tracing_saved_cmdlines_fops; extern const struct file_operations tracing_saved_tgids_fops; extern const struct file_operations tracing_saved_cmdlines_size_fops; DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); void trace_buffered_event_enable(void); void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { if (this_cpu_read(trace_buffered_event) == event) { /* Simply release the temp buffer and enable preemption */ this_cpu_dec(trace_buffered_event_cnt); preempt_enable_notrace(); return; } /* ring_buffer_discard_commit() enables preemption */ ring_buffer_discard_commit(buffer, event); } /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires * filtering against its fields, then they will be called as the * entry already holds the field information of the current event. * * It also checks if the event should be discarded or not. * It is to be discarded if the event is soft disabled and the * event was only recorded to process triggers, or if the event * filter is active and this event did not match the filters. * * Returns true if the event is discarded, false otherwise. */ static inline bool __event_trigger_test_discard(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, enum event_trigger_type *tt) { unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, buffer, entry, event); if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED | EVENT_FILE_FL_PID_FILTER)))) return false; if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) goto discard; if (file->flags & EVENT_FILE_FL_FILTERED && !filter_match_preds(file->filter, entry)) goto discard; if ((file->flags & EVENT_FILE_FL_PID_FILTER) && trace_event_ignore_this_pid(file)) goto discard; return false; discard: __trace_event_discard_commit(buffer, event); return true; } /** * event_trigger_unlock_commit - handle triggers and finish event commit * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself * @trace_ctx: The tracing context flags. * * This is a helper function to handle triggers that require data * from the event itself. It also tests the event against filters and * if the event is soft disabled and should be discarded. */ static inline void event_trigger_unlock_commit(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned int trace_ctx) { enum event_trigger_type tt = ETT_NONE; if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx); if (tt) event_triggers_post_call(file, tt); } #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) /* * The max preds is the size of unsigned short with * two flags at the MSBs. One bit is used for both the IS_RIGHT * and FOLD flags. The other is reserved. * * 2^14 preds is way more than enough. */ #define MAX_FILTER_PRED 16384 struct filter_pred; struct regex; typedef int (*regex_match_func)(char *str, struct regex *r, int len); enum regex_type { MATCH_FULL = 0, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY, MATCH_GLOB, MATCH_INDEX, }; struct regex { char pattern[MAX_FILTER_STR_VAL]; int len; int field_len; regex_match_func match; }; static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_RDYN_STRING || field->filter_type == FILTER_STATIC_STRING || field->filter_type == FILTER_PTR_STRING || field->filter_type == FILTER_COMM; } static inline bool is_function_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_TRACE_FN; } extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct trace_event_file *file, struct trace_seq *s); extern int apply_event_filter(struct trace_event_file *file, char *filter_string); extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); extern int create_event_filter(struct trace_array *tr, struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); extern int init_events(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); extern void __trace_early_add_events(struct trace_array *tr); extern struct trace_event_file *__find_event_file(struct trace_array *tr, const char *system, const char *event); extern struct trace_event_file *find_event_file(struct trace_array *tr, const char *system, const char *event); static inline void *event_file_data(struct file *filp) { return READ_ONCE(file_inode(filp)->i_private); } extern struct mutex event_mutex; extern struct list_head ftrace_events; /* * When the trace_event_file is the filp->i_private pointer, * it must be taken under the event_mutex lock, and then checked * if the EVENT_FILE_FL_FREED flag is set. If it is, then the * data pointed to by the trace_event_file can not be trusted. * * Use the event_file_file() to access the trace_event_file from * the filp the first time under the event_mutex and check for * NULL. If it is needed to be retrieved again and the event_mutex * is still held, then the event_file_data() can be used and it * is guaranteed to be valid. */ static inline struct trace_event_file *event_file_file(struct file *filp) { struct trace_event_file *file; lockdep_assert_held(&event_mutex); file = READ_ONCE(file_inode(filp)->i_private); if (!file || file->flags & EVENT_FILE_FL_FREED) return NULL; return file; } extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS extern int register_trigger_hist_cmd(void); extern int register_trigger_hist_enable_disable_cmds(void); #else static inline int register_trigger_hist_cmd(void) { return 0; } static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } #endif extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); enum { EVENT_TRIGGER_FL_PROBE = BIT(0), }; struct event_trigger_data { unsigned long count; int ref; int flags; const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; void *private_data; bool paused; bool paused_tmp; struct list_head list; char *name; struct list_head named_list; struct event_trigger_data *named_data; }; /* Avoid typos */ #define ENABLE_EVENT_STR "enable_event" #define DISABLE_EVENT_STR "disable_event" #define ENABLE_HIST_STR "enable_hist" #define DISABLE_HIST_STR "disable_hist" struct enable_trigger_data { struct trace_event_file *file; bool enable; bool hist; }; extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_data *data); extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param_and_filter); extern int event_enable_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file); extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); extern struct event_trigger_data * trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param, void *private_data); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); extern struct event_trigger_data *find_named_trigger(const char *name); extern bool is_named_trigger(struct event_trigger_data *test); extern int save_named_trigger(const char *name, struct event_trigger_data *data); extern void del_named_trigger(struct event_trigger_data *data); extern void pause_named_trigger(struct event_trigger_data *data); extern void unpause_named_trigger(struct event_trigger_data *data); extern void set_named_trigger_data(struct event_trigger_data *data, struct event_trigger_data *named_data); extern struct event_trigger_data * get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); extern bool event_trigger_check_remove(const char *glob); extern bool event_trigger_empty_param(const char *param); extern int event_trigger_separate_filter(char *param_and_filter, char **param, char **filter, bool param_required); extern int event_trigger_parse_num(char *trigger, struct event_trigger_data *trigger_data); extern int event_trigger_set_filter(struct event_command *cmd_ops, struct trace_event_file *file, char *param, struct event_trigger_data *trigger_data); extern void event_trigger_reset_filter(struct event_command *cmd_ops, struct event_trigger_data *trigger_data); extern int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, struct event_trigger_data *trigger_data); extern void event_trigger_unregister(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, struct event_trigger_data *trigger_data); extern void event_file_get(struct trace_event_file *file); extern void event_file_put(struct trace_event_file *file); /** * struct event_trigger_ops - callbacks for trace event triggers * * The methods in this structure provide per-event trigger hooks for * various trigger operations. * * The @init and @free methods are used during trigger setup and * teardown, typically called from an event_command's @parse() * function implementation. * * The @print method is used to print the trigger spec. * * The @trigger method is the function that actually implements the * trigger and is called in the context of the triggering event * whenever that event occurs. * * All the methods below, except for @init() and @free(), must be * implemented. * * @trigger: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that * registered the trigger (see struct event_command) along with * the trace record, rec. * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() * function). This can be used to perform per-trigger * initialization such as incrementing a per-trigger reference * count, for instance. This is usually implemented by the * generic utility function @event_trigger_init() (see * trace_event_triggers.c). * * @free: An optional de-initialization function called for the * trigger when the trigger is unregistered (via the * event_command @reg() function). This can be used to perform * per-trigger de-initialization such as decrementing a * per-trigger reference count and freeing corresponding trigger * data, for instance. This is usually implemented by the * generic utility function @event_trigger_free() (see * trace_event_triggers.c). * * @print: The callback function invoked to have the trigger print * itself. This is usually implemented by a wrapper function * that calls the generic utility function @event_trigger_print() * (see trace_event_triggers.c). */ struct event_trigger_ops { void (*trigger)(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); int (*init)(struct event_trigger_data *data); void (*free)(struct event_trigger_data *data); int (*print)(struct seq_file *m, struct event_trigger_data *data); }; /** * struct event_command - callbacks and data members for event commands * * Event commands are invoked by users by writing the command name * into the 'trigger' file associated with a trace event. The * parameters associated with a specific invocation of an event * command are used to create an event trigger instance, which is * added to the list of trigger instances associated with that trace * event. When the event is hit, the set of triggers associated with * that event is invoked. * * The data members in this structure provide per-event command data * for various event commands. * * All the data members below, except for @post_trigger, must be set * for each event command. * * @name: The unique name that identifies the event command. This is * the name used when setting triggers via trigger files. * * @trigger_type: A unique id that identifies the event command * 'type'. This value has two purposes, the first to ensure that * only one trigger of the same type can be set at a given time * for a particular event e.g. it doesn't make sense to have both * a traceon and traceoff trigger attached to a single event at * the same time, so traceon and traceoff have the same type * though they have different names. The @trigger_type value is * also used as a bit value for deferring the actual trigger * action until after the current event is finished. Some * commands need to do this if they themselves log to the trace * buffer (see the @post_trigger() member below). @trigger_type * values are defined by adding new values to the trigger_type * enum in include/linux/trace_events.h. * * @flags: See the enum event_command_flags below. * * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. * * @parse: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the * user. It allocates the trigger instance and registers it with * the appropriate trace event. It makes use of the other * event_command callback functions to orchestrate this, and is * usually implemented by the generic utility function * @event_trigger_callback() (see trace_event_triggers.c). * * @reg: Adds the trigger to the list of triggers associated with the * event, and enables the event trigger itself, after * initializing it (via the event_trigger_ops @init() function). * This is also where commands can use the @trigger_type value to * make the decision as to whether or not multiple instances of * the trigger should be allowed. This is usually implemented by * the generic utility function @register_trigger() (see * trace_event_triggers.c). * * @unreg: Removes the trigger from the list of triggers associated * with the event, and disables the event trigger itself, after * initializing it (via the event_trigger_ops @free() function). * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * * @unreg_all: An optional function called to remove all the triggers * from the list of triggers associated with the event. Called * when a trigger file is opened in truncate mode. * * @set_filter: An optional function called to parse and set a filter * for the trigger. If no @set_filter() method is set for the * event command, filters set by the user for the command will be * ignored. This is usually implemented by the generic utility * function @set_trigger_filter() (see trace_event_triggers.c). * * @get_trigger_ops: The callback function invoked to retrieve the * event_trigger_ops implementation associated with the command. * This callback function allows a single event_command to * support multiple trigger implementations via different sets of * event_trigger_ops, depending on the value of the @param * string. */ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; int flags; int (*parse)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param_and_filter); int (*reg)(char *glob, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg)(char *glob, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg_all)(struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; /** * enum event_command_flags - flags for struct event_command * * @POST_TRIGGER: A flag that says whether or not this command needs * to have its action delayed until after the current event has * been closed. Some triggers need to avoid being invoked while * an event is currently in the process of being logged, since * the trigger may itself log data into the trace buffer. Thus * we make sure the current event is committed before invoking * those triggers. To do that, the trigger invocation is split * in two - the first part checks the filter using the current * trace record; if a command has the @post_trigger flag set, it * sets a bit for itself in the return value, otherwise it * directly invokes the trigger. Once all commands have been * either invoked or set their return flag, the current record is * either committed or discarded. At that point, if any commands * have deferred their triggers, those commands are finally * invoked following the close of the current event. In other * words, if the event_trigger_ops @func() probe implementation * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. * * @NEEDS_REC: A flag that says whether or not this command needs * access to the trace record in order to perform its function, * regardless of whether or not it has a filter associated with * it (filters make a trigger require access to the trace record * but are not always present). */ enum event_command_flags { EVENT_CMD_FL_POST_TRIGGER = 1, EVENT_CMD_FL_NEEDS_REC = 2, }; static inline bool event_command_post_trigger(struct event_command *cmd_ops) { return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; } static inline bool event_command_needs_rec(struct event_command *cmd_ops) { return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; } extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); extern int tracing_snapshot_cond_disable(struct trace_array *tr); extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); /* Used from boot time tracer */ extern int trace_set_options(struct trace_array *tr, char *option); extern int tracing_set_tracer(struct trace_array *tr, const char *buf); extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id); extern int tracing_set_cpumask(struct trace_array *tr, cpumask_var_t tracing_cpumask_new); #define MAX_EVENT_NAME_LEN 64 extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)); extern unsigned int err_pos(char *cmd, const char *str); extern void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, const char **errs, u8 type, u16 pos); /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats * into sections to display. But the trace infrastructure wants * to use these without the added overhead at the price of being * a bit slower (used mainly for warnings, where we don't care * about performance). The internal_trace_puts() is for such * a purpose. */ #define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct trace_event_call \ __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED #define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL #endif #ifdef CONFIG_FTRACE_SYSCALLS void init_ftrace_syscalls(void); const char *get_syscall_name(int syscall); #else static inline void init_ftrace_syscalls(void) { } static inline const char *get_syscall_name(int syscall) { return NULL; } #endif #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); void trace_event_eval_update(struct trace_eval_map **map, int len); /* Used from boot time tracer */ extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif #ifdef CONFIG_TRACER_SNAPSHOT void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); int tracing_arm_snapshot(struct trace_array *tr); void tracing_disarm_snapshot(struct trace_array *tr); #else static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } static inline void tracing_disarm_snapshot(struct trace_array *tr) { } #endif #ifdef CONFIG_PREEMPT_TRACER void tracer_preempt_on(unsigned long a0, unsigned long a1); void tracer_preempt_off(unsigned long a0, unsigned long a1); #else static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } #endif #ifdef CONFIG_IRQSOFF_TRACER void tracer_hardirqs_on(unsigned long a0, unsigned long a1); void tracer_hardirqs_off(unsigned long a0, unsigned long a1); #else static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif /* * Reset the state of the trace_iterator so that it can read consumed data. * Normally, the trace_iterator is used for reading the data when it is not * consumed, and must retain state. */ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) { memset_startat(iter, 0, seq); iter->pos = -1; } /* Check the name is good for event/group/fields */ static inline bool __is_good_name(const char *name, bool hash_ok) { if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-')) return false; while (*++name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_' && (!hash_ok || *name != '-')) return false; } return true; } /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { return __is_good_name(name, false); } /* Check the name is good for system */ static inline bool is_good_system_name(const char *name) { return __is_good_name(name, true); } /* Convert certain expected symbols into '_' when generating event names */ static inline void sanitize_event_name(char *name) { while (*name++ != '\0') if (*name == ':' || *name == '.') *name = '_'; } /* * This is a generic way to read and write a u64 value from a file in tracefs. * * The value is stored on the variable pointed by *val. The value needs * to be at least *min and at most *max. The write is protected by an * existing *lock. */ struct trace_min_max_param { struct mutex *lock; u64 *val; u64 *min; u64 *max; }; #define U64_STR_SIZE 24 /* 20 digits max */ extern const struct file_operations trace_min_max_fops; #ifdef CONFIG_RV extern int rv_init_interface(void); #else static inline int rv_init_interface(void) { return 0; } #endif /* * This is used only to distinguish * function address from trampoline code. * So this value has no meaning. */ #define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) #endif /* _LINUX_KERNEL_TRACE_H */
37 35 42 57 28 13 2 1 5 2 2 1 3 3 3 3 4 4 4 1 5 3 1 1 3 3 7 7 7 20 20 1 4 2 1 1 1 1 19 18 1 4 3 1 6 2 4 5 3 1 1 1 5 3 1 1 1 1 4 3 1 1 5 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> #include "power.h" #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, * they should always be called with system_transition_mutex held * (gfp_allowed_mask also should only be modified with system_transition_mutex * held, unless the suspend/hibernate code is guaranteed not to run in parallel * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; } } void pm_restrict_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } unsigned int lock_system_sleep(void) { unsigned int flags = current->flags; current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { ktime_t start; long elapsed_msecs; start = ktime_get(); ksys_sync(); elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); pr_info("Filesystems sync: %ld.%03ld seconds\n", elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); int register_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(register_pm_notifier); int unregister_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } int pm_notifier_call_chain(unsigned long val) { return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_async_enabled = val; return n; } power_attr(pm_async); #ifdef CONFIG_SUSPEND static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { if (i >= PM_SUSPEND_MEM && cxl_mem_active()) continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) count += sysfs_emit_at(buf, count, "[%s] ", label); else count += sysfs_emit_at(buf, count, "%s ", label); } } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static suspend_state_t decode_suspend_state(const char *buf, size_t n) { suspend_state_t state; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = mem_sleep_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } return PM_SUSPEND_ON; } static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_suspend_state(buf, n); if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) mem_sleep_current = state; else error = -EINVAL; out: pm_autosleep_unlock(); return error ? error : n; } power_attr(mem_sleep); /* * sync_on_suspend: invoke ksys_sync_helper() before suspend. * * show() returns whether ksys_sync_helper() is invoked before suspend. * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; sync_on_suspend_enabled = !!val; return n; } power_attr(sync_on_suspend); #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_PM_SLEEP_DEBUG int pm_test_level = TEST_NONE; static const char * const pm_tests[__TEST_AFTER_LAST] = { [TEST_NONE] = "none", [TEST_CORE] = "core", [TEST_CPUS] = "processors", [TEST_PLATFORM] = "platform", [TEST_DEVICES] = "devices", [TEST_FREEZER] = "freezer", }; static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]); else count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]); } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int sleep_flags; const char * const *s; int error = -EINVAL; int level; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { pm_test_level = level; error = 0; break; } unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ #define SUSPEND_NR_STEPS SUSPEND_RESUME #define REC_FAILED_NUM 2 struct suspend_stats { unsigned int step_failures[SUSPEND_NR_STEPS]; unsigned int success; unsigned int fail; int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; int last_failed_errno; int errno[REC_FAILED_NUM]; int last_failed_step; u64 last_hw_sleep; u64 total_hw_sleep; u64 max_hw_sleep; enum suspend_stat_step failed_steps[REC_FAILED_NUM]; }; static struct suspend_stats suspend_stats; static DEFINE_MUTEX(suspend_stats_lock); void dpm_save_failed_dev(const char *name) { mutex_lock(&suspend_stats_lock); strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], name, sizeof(suspend_stats.failed_devs[0])); suspend_stats.last_failed_dev++; suspend_stats.last_failed_dev %= REC_FAILED_NUM; mutex_unlock(&suspend_stats_lock); } void dpm_save_failed_step(enum suspend_stat_step step) { suspend_stats.step_failures[step-1]++; suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; suspend_stats.last_failed_step++; suspend_stats.last_failed_step %= REC_FAILED_NUM; } void dpm_save_errno(int err) { if (!err) { suspend_stats.success++; return; } suspend_stats.fail++; suspend_stats.errno[suspend_stats.last_failed_errno] = err; suspend_stats.last_failed_errno++; suspend_stats.last_failed_errno %= REC_FAILED_NUM; } void pm_report_hw_sleep_time(u64 t) { suspend_stats.last_hw_sleep = t; suspend_stats.total_hw_sleep += t; } EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); void pm_report_max_hw_sleep(u64 t) { suspend_stats.max_hw_sleep = t; } EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); static const char * const suspend_step_names[] = { [SUSPEND_WORKING] = "", [SUSPEND_FREEZE] = "freeze", [SUSPEND_PREPARE] = "prepare", [SUSPEND_SUSPEND] = "suspend", [SUSPEND_SUSPEND_LATE] = "suspend_late", [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq", [SUSPEND_RESUME_NOIRQ] = "resume_noirq", [SUSPEND_RESUME_EARLY] = "resume_early", [SUSPEND_RESUME] = "resume", }; #define suspend_attr(_name, format_str) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sysfs_emit(buf, format_str, suspend_stats._name);\ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_attr(success, "%u\n"); suspend_attr(fail, "%u\n"); suspend_attr(last_hw_sleep, "%llu\n"); suspend_attr(total_hw_sleep, "%llu\n"); suspend_attr(max_hw_sleep, "%llu\n"); #define suspend_step_attr(_name, step) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sysfs_emit(buf, "%u\n", \ suspend_stats.step_failures[step-1]); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_step_attr(failed_freeze, SUSPEND_FREEZE); suspend_step_attr(failed_prepare, SUSPEND_PREPARE); suspend_step_attr(failed_suspend, SUSPEND_SUSPEND); suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE); suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ); suspend_step_attr(failed_resume, SUSPEND_RESUME); suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY); suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ); static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; char *last_failed_dev = NULL; index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; return sysfs_emit(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); static ssize_t last_failed_errno_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; int last_failed_errno; index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; return sysfs_emit(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); static ssize_t last_failed_step_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { enum suspend_stat_step step; int index; index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; return sysfs_emit(buf, "%s\n", suspend_step_names[step]); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); static struct attribute *suspend_attrs[] = { &success.attr, &fail.attr, &failed_freeze.attr, &failed_prepare.attr, &failed_suspend.attr, &failed_suspend_late.attr, &failed_suspend_noirq.attr, &failed_resume.attr, &failed_resume_early.attr, &failed_resume_noirq.attr, &last_failed_dev.attr, &last_failed_errno.attr, &last_failed_step.attr, &last_hw_sleep.attr, &total_hw_sleep.attr, &max_hw_sleep.attr, NULL, }; static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { if (attr != &last_hw_sleep.attr && attr != &total_hw_sleep.attr && attr != &max_hw_sleep.attr) return 0444; #ifdef CONFIG_ACPI if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) return 0444; #endif return 0; } static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, .is_visible = suspend_attr_is_visible, }; #ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; enum suspend_stat_step step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; seq_printf(s, "success: %u\nfail: %u\n", suspend_stats.success, suspend_stats.fail); for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++) seq_printf(s, "failed_%s: %u\n", suspend_step_names[step], suspend_stats.step_failures[step-1]); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", suspend_step_names[suspend_stats.failed_steps[last_step]]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_step_names[suspend_stats.failed_steps[index]]); } return 0; } DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, NULL, NULL, &suspend_stats_fops); return 0; } late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ bool pm_sleep_transition_in_progress(void) { return pm_suspend_in_progress() || hibernation_in_progress(); } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG /* * pm_print_times: print time taken by devices to suspend and resume. * * show() returns whether printing of suspend and resume times is enabled. * store() accepts 0 or 1. 0 disables printing and 1 enables it. */ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_print_times_enabled = !!val; return n; } power_attr(pm_print_times); static inline void pm_print_times_init(void) { pm_print_times_enabled = initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (!pm_wakeup_irq()) return -ENODATA; return sysfs_emit(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { return pm_debug_messages_on && pm_sleep_transition_in_progress(); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_debug_messages_on = !!val; return n; } power_attr(pm_debug_messages); static int __init pm_debug_messages_setup(char *str) { pm_debug_messages_on = true; return 1; } __setup("pm_debug_messages", pm_debug_messages_setup); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ struct kobject *power_kobj; /* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", * "freeze" and "disk" (hibernation). * See Documentation/admin-guide/pm/sleep-states.rst for a description of * what they mean. * * store() accepts one of those strings, translates it into the proper * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) count += sysfs_emit_at(buf, count, "%s ", pm_states[i]); #endif if (hibernation_available()) count += sysfs_emit_at(buf, count, "disk "); /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state; #endif char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; /* Check hibernation first. */ if (len == 4 && str_has_prefix(buf, "disk")) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = pm_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } #endif return PM_SUSPEND_ON; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_state(buf, n); if (state < PM_SUSPEND_MAX) { if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_suspend(state); } else if (state == PM_SUSPEND_MAX) { error = hibernate(); } else { error = -EINVAL; } out: pm_autosleep_unlock(); return error ? error : n; } power_attr(state); #ifdef CONFIG_PM_SLEEP /* * The 'wakeup_count' attribute, along with the functions defined in * drivers/base/power/wakeup.c, provides a means by which wakeup events can be * handled in a non-racy way. * * If a wakeup event occurs when the system is in a sleep state, it simply is * woken up. In turn, if an event that would wake the system up from a sleep * state occurs when it is undergoing a transition to that sleep state, the * transition should be aborted. Moreover, if such an event occurs when the * system is in the working state, an attempt to start a transition to the * given sleep state should fail during certain period after the detection of * the event. Using the 'state' attribute alone is not sufficient to satisfy * these requirements, because a wakeup event may occur exactly when 'state' * is being written to and may be delivered to user space right before it is * frozen, so the event will remain only partially processed until the system is * woken up by another event. In particular, it won't cause the transition to * a sleep state to be aborted. * * This difficulty may be overcome if user space uses 'wakeup_count' before * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. */ static ssize_t wakeup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int val; return pm_get_wakeup_count(&val, true) ? sysfs_emit(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int val; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) error = n; else pm_print_active_wakeup_sources(); } out: pm_autosleep_unlock(); return error; } power_attr(wakeup_count); #ifdef CONFIG_PM_AUTOSLEEP static ssize_t autosleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) return sysfs_emit(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) return sysfs_emit(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sysfs_emit(buf, "disk\n"); #else return sysfs_emit(buf, "error\n"); #endif } static ssize_t autosleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state = decode_state(buf, n); int error; if (state == PM_SUSPEND_ON && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_autosleep_set_state(state); return error ? error : n; } power_attr(autosleep); #endif /* CONFIG_PM_AUTOSLEEP */ #ifdef CONFIG_PM_WAKELOCKS static ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, true); } static ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_lock(buf); return error ? error : n; } power_attr(wake_lock); static ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, false); } static ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_unlock(buf); return error ? error : n; } power_attr(wake_unlock); #endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_trace_enabled); } static ssize_t pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int val; if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; if (pm_trace_enabled) { pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" "PM: Correct system time has to be restored manually after resume.\n"); } return n; } return -EINVAL; } power_attr(pm_trace); static ssize_t pm_trace_dev_match_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return show_trace_dev_match(buf, PAGE_SIZE); } power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ #ifdef CONFIG_FREEZER static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; freeze_timeout_msecs = val; return n; } power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ #if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) bool filesystem_freeze_enabled = false; static ssize_t freeze_filesystems_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled); } static ssize_t freeze_filesystems_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; filesystem_freeze_enabled = !!val; return n; } power_attr(freeze_filesystems); #endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, #ifdef CONFIG_SUSPEND &mem_sleep_attr.attr, &sync_on_suspend_attr.attr, #endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif #ifdef CONFIG_PM_WAKELOCKS &wake_lock_attr.attr, &wake_unlock_attr.attr, #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_test_attr.attr, &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif #if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) &freeze_filesystems_attr.attr, #endif NULL, }; static const struct attribute_group attr_group = { .attrs = g, }; static const struct attribute_group *attr_groups[] = { &attr_group, #ifdef CONFIG_PM_SLEEP &suspend_attr_group, #endif NULL, }; struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } static int __init pm_init(void) { int error = pm_start_workqueue(); if (error) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; error = sysfs_create_groups(power_kobj, attr_groups); if (error) return error; pm_print_times_init(); return pm_autosleep_init(); } core_initcall(pm_init);
46 45 1 16 1 5 3 1 3 9 9 1 2 10 1 1 10 11 11 3 8 11 2 1 2 1 1 5 7 7 77 51 19 30 1 1 1 1 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Red Hat. All rights reserved. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ctree.h" #include "fs.h" #include "messages.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" #include "disk-io.h" #include "props.h" #include "locking.h" #include "accessors.h" #include "dir-item.h" int btrfs_getxattr(const struct inode *inode, const char *name, void *buffer, size_t size) { struct btrfs_dir_item *di; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct extent_buffer *leaf; int ret = 0; unsigned long data_ptr; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* lookup the xattr by name */ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, strlen(name), 0); if (!di) { ret = -ENODATA; goto out; } else if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; } leaf = path->nodes[0]; /* if size is 0, that means we want the size of the attr */ if (!size) { ret = btrfs_dir_data_len(leaf, di); goto out; } /* now get the data out of our dir_item */ if (btrfs_dir_data_len(leaf, di) > size) { ret = -ERANGE; goto out; } /* * The way things are packed into the leaf is like this * |struct btrfs_dir_item|name|data| * where name is the xattr name, so security.foo, and data is the * content of the xattr. data_ptr points to the location in memory * where the data starts in the in memory leaf */ data_ptr = (unsigned long)((char *)(di + 1) + btrfs_dir_name_len(leaf, di)); read_extent_buffer(leaf, buffer, data_ptr, btrfs_dir_data_len(leaf, di)); ret = btrfs_dir_data_len(leaf, di); out: btrfs_free_path(path); return ret; } int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; ASSERT(trans); if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->skip_release_on_error = 1; if (!value) { di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, -1); if (!di && (flags & XATTR_REPLACE)) ret = -ENODATA; else if (IS_ERR(di)) ret = PTR_ERR(di); else if (di) ret = btrfs_delete_one_dir_name(trans, root, path, di); goto out; } /* * For a replace we can't just do the insert blindly. * Do a lookup first (read-only btrfs_search_slot), and return if xattr * doesn't exist. If it exists, fall down below to the insert/replace * path - we can't race with a concurrent xattr delete, because the VFS * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) ret = -ENODATA; else if (IS_ERR(di)) ret = PTR_ERR(di); if (ret) goto out; btrfs_release_path(path); di = NULL; } ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, value, size); if (ret == -EOVERFLOW) { /* * We have an existing item in a leaf, split_leaf couldn't * expand it. That item might have or not a dir_item that * matches our target xattr, so lets check. */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; } if (di && (flags & XATTR_CREATE)) { ret = -EEXIST; goto out; } if (di) { /* * We're doing a replace, and it must be atomic, that is, at * any point in time we have either the old or the new xattr * value in the tree. We don't want readers (getxattr and * listxattrs) to miss a value, this is specially important * for ACLs. */ const int slot = path->slots[0]; struct extent_buffer *leaf = path->nodes[0]; const u16 old_data_len = btrfs_dir_data_len(leaf, di); const u32 item_size = btrfs_item_size(leaf, slot); const u32 data_size = sizeof(*di) + name_len + size; unsigned long data_ptr; char *ptr; if (size > old_data_len) { if (btrfs_leaf_free_space(leaf) < (size - old_data_len)) { ret = -ENOSPC; goto out; } } if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) btrfs_extend_item(trans, path, size - old_data_len); else if (size < old_data_len) btrfs_truncate_item(trans, path, data_size, 1); } else { /* There are other xattrs packed in the same item. */ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; btrfs_extend_item(trans, path, data_size); } ptr = btrfs_item_ptr(leaf, slot, char); ptr += btrfs_item_size(leaf, slot) - data_size; di = (struct btrfs_dir_item *)ptr; btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is * where our xattr dir_item is and btrfs_insert_xattr_item() * filled it. */ } out: btrfs_free_path(path); if (!ret) { set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags); } return ret; } /* * @value: "" makes the attribute to empty, NULL removes it */ int btrfs_setxattr_trans(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; const bool start_trans = (current->journal_info == NULL); int ret; if (start_trans) { /* * 1 unit for inserting/updating/deleting the xattr * 1 unit for the inode item update */ trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); } else { /* * This can happen when smack is enabled and a directory is being * created. It happens through d_instantiate_new(), which calls * smack_d_instantiate(), which in turn calls __vfs_setxattr() to * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the * inode. We have already reserved space for the xattr and inode * update at btrfs_mkdir(), so just use the transaction handle. * We don't join or start a transaction, as that will reset the * block_rsv of the handle and trigger a warning for the start * case. */ ASSERT(strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0); trans = current->journal_info; } ret = btrfs_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; inode_inc_iversion(inode); inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); out: if (start_trans) btrfs_end_transaction(trans); return ret; } ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key found_key; struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; /* * ok we want all objects associated with this id. * NOTE: we set key.offset = 0; because we want to start with the * first xattr that we find and walk forward */ key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; /* search for our xattrs */ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf; int slot; struct btrfs_dir_item *di; u32 item_size; u32 cur; leaf = path->nodes[0]; slot = path->slots[0]; /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; if (found_key.type < BTRFS_XATTR_ITEM_KEY) continue; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); item_size = btrfs_item_size(leaf, slot); cur = 0; while (cur < item_size) { u16 name_len = btrfs_dir_name_len(leaf, di); u16 data_len = btrfs_dir_data_len(leaf, di); u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); total_size += name_len + 1; /* * We are just looking for how big our buffer needs to * be. */ if (!size) goto next; if (!buffer || (name_len + 1) > size_left) { iter_ret = -ERANGE; break; } read_extent_buffer(leaf, buffer, name_ptr, name_len); buffer[name_len] = '\0'; size_left -= name_len + 1; buffer += name_len + 1; next: cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } } if (iter_ret < 0) ret = iter_ret; else ret = total_size; btrfs_free_path(path); return ret; } static int btrfs_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { name = xattr_full_name(handler, name); return btrfs_getxattr(inode, name, buffer, size); } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { if (btrfs_root_readonly(BTRFS_I(inode)->root)) return -EROFS; name = xattr_full_name(handler, name); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { int ret; bool is_cap = false; name = xattr_full_name(handler, name); /* * security.capability doesn't cache the results, so calls into us * constantly to see if there's a capability xattr. Cache the result * here in order to avoid wasting time doing lookups for xattrs we know * don't exist. */ if (strcmp(name, XATTR_NAME_CAPS) == 0) { is_cap = true; if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags)) return -ENODATA; } ret = btrfs_getxattr(inode, name, buffer, size); if (ret == -ENODATA && is_cap) set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); return ret; } static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { if (btrfs_root_readonly(BTRFS_I(inode)->root)) return -EROFS; name = xattr_full_name(handler, name); if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { int ret; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; name = xattr_full_name(handler, name); ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); if (ret) return ret; if (btrfs_ignore_prop(BTRFS_I(inode), name)) return 0; trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_set_prop(trans, BTRFS_I(inode), name, value, size, flags); if (!ret) { inode_inc_iversion(inode); inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); } btrfs_end_transaction(trans); return ret; } static const struct xattr_handler btrfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = btrfs_xattr_handler_get_security, .set = btrfs_xattr_handler_set_security, }; static const struct xattr_handler btrfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = btrfs_xattr_handler_get, .set = btrfs_xattr_handler_set, }; static const struct xattr_handler btrfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = btrfs_xattr_handler_get, .set = btrfs_xattr_handler_set, }; static const struct xattr_handler btrfs_btrfs_xattr_handler = { .prefix = XATTR_BTRFS_PREFIX, .get = btrfs_xattr_handler_get, .set = btrfs_xattr_handler_set_prop, }; const struct xattr_handler * const btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, &btrfs_btrfs_xattr_handler, NULL, }; static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_private) { struct btrfs_trans_handle *trans = fs_private; const struct xattr *xattr; unsigned int nofs_flag; char *name; int ret = 0; /* * We're holding a transaction handle, so use a NOFS memory allocation * context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { ret = -ENOMEM; break; } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); ret = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); if (ret < 0) break; } memalloc_nofs_restore(nofs_flag); return ret; } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) { return security_inode_init_security(inode, dir, qstr, &btrfs_initxattrs, trans); }
2 6 3 3 3 2 1 1 2 1 1 2 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 // SPDX-License-Identifier: GPL-2.0+ /* * USB Cypress M8 driver * * Copyright (C) 2004 * Lonnie Mendez (dignome@gmail.com) * Copyright (C) 2003,2004 * Neil Whelchel (koyama@firstlight.net) * * See Documentation/usb/usb-serial.rst for more information on using this * driver * * See http://geocities.com/i0xox0i for information on this driver and the * earthmate usb device. */ /* Thanks to Neil Whelchel for writing the first cypress m8 implementation for linux. */ /* Thanks to cypress for providing references for the hid reports. */ /* Thanks to Jiang Zhang for providing links and for general help. */ /* Code originates and was built up from ftdi_sio, belkin, pl2303 and others.*/ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/serial.h> #include <linux/kfifo.h> #include <linux/delay.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include "cypress_m8.h" static bool stats; static int interval; static bool unstable_bauds; #define DRIVER_AUTHOR "Lonnie Mendez <dignome@gmail.com>, Neil Whelchel <koyama@firstlight.net>" #define DRIVER_DESC "Cypress USB to Serial Driver" /* write buffer size defines */ #define CYPRESS_BUF_SIZE 1024 static const struct usb_device_id id_table_earthmate[] = { { USB_DEVICE(VENDOR_ID_DELORME, PRODUCT_ID_EARTHMATEUSB) }, { USB_DEVICE(VENDOR_ID_DELORME, PRODUCT_ID_EARTHMATEUSB_LT20) }, { } /* Terminating entry */ }; static const struct usb_device_id id_table_cyphidcomrs232[] = { { USB_DEVICE(VENDOR_ID_CYPRESS, PRODUCT_ID_CYPHIDCOM) }, { USB_DEVICE(VENDOR_ID_SAI, PRODUCT_ID_CYPHIDCOM) }, { USB_DEVICE(VENDOR_ID_POWERCOM, PRODUCT_ID_UPS) }, { USB_DEVICE(VENDOR_ID_FRWD, PRODUCT_ID_CYPHIDCOM_FRWD) }, { } /* Terminating entry */ }; static const struct usb_device_id id_table_nokiaca42v2[] = { { USB_DEVICE(VENDOR_ID_DAZZLE, PRODUCT_ID_CA42) }, { } /* Terminating entry */ }; static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(VENDOR_ID_DELORME, PRODUCT_ID_EARTHMATEUSB) }, { USB_DEVICE(VENDOR_ID_DELORME, PRODUCT_ID_EARTHMATEUSB_LT20) }, { USB_DEVICE(VENDOR_ID_CYPRESS, PRODUCT_ID_CYPHIDCOM) }, { USB_DEVICE(VENDOR_ID_SAI, PRODUCT_ID_CYPHIDCOM) }, { USB_DEVICE(VENDOR_ID_POWERCOM, PRODUCT_ID_UPS) }, { USB_DEVICE(VENDOR_ID_FRWD, PRODUCT_ID_CYPHIDCOM_FRWD) }, { USB_DEVICE(VENDOR_ID_DAZZLE, PRODUCT_ID_CA42) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table_combined); enum packet_format { packet_format_1, /* b0:status, b1:payload count */ packet_format_2 /* b0[7:3]:status, b0[2:0]:payload count */ }; struct cypress_private { spinlock_t lock; /* private lock */ int chiptype; /* identifier of device, for quirks/etc */ int bytes_in; /* used for statistics */ int bytes_out; /* used for statistics */ int cmd_count; /* used for statistics */ int cmd_ctrl; /* always set this to 1 before issuing a command */ struct kfifo write_fifo; /* write fifo */ int write_urb_in_use; /* write urb in use indicator */ int write_urb_interval; /* interval to use for write urb */ int read_urb_interval; /* interval to use for read urb */ int comm_is_ok; /* true if communication is (still) ok */ __u8 line_control; /* holds dtr / rts value */ __u8 current_status; /* received from last read - info on dsr,cts,cd,ri,etc */ __u8 current_config; /* stores the current configuration byte */ __u8 rx_flags; /* throttling - used from whiteheat/ftdi_sio */ enum packet_format pkt_fmt; /* format to use for packet send / receive */ int get_cfg_unsafe; /* If true, the CYPRESS_GET_CONFIG is unsafe */ int baud_rate; /* stores current baud rate in integer form */ char prev_status; /* used for TIOCMIWAIT */ }; /* function prototypes for the Cypress USB to serial device */ static int cypress_earthmate_port_probe(struct usb_serial_port *port); static int cypress_hidcom_port_probe(struct usb_serial_port *port); static int cypress_ca42v2_port_probe(struct usb_serial_port *port); static void cypress_port_remove(struct usb_serial_port *port); static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port); static void cypress_close(struct usb_serial_port *port); static void cypress_dtr_rts(struct usb_serial_port *port, int on); static int cypress_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count); static void cypress_send(struct usb_serial_port *port); static unsigned int cypress_write_room(struct tty_struct *tty); static void cypress_earthmate_init_termios(struct tty_struct *tty); static void cypress_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios); static int cypress_tiocmget(struct tty_struct *tty); static int cypress_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear); static unsigned int cypress_chars_in_buffer(struct tty_struct *tty); static void cypress_throttle(struct tty_struct *tty); static void cypress_unthrottle(struct tty_struct *tty); static void cypress_set_dead(struct usb_serial_port *port); static void cypress_read_int_callback(struct urb *urb); static void cypress_write_int_callback(struct urb *urb); static struct usb_serial_driver cypress_earthmate_device = { .driver = { .name = "earthmate", }, .description = "DeLorme Earthmate USB", .id_table = id_table_earthmate, .num_ports = 1, .port_probe = cypress_earthmate_port_probe, .port_remove = cypress_port_remove, .open = cypress_open, .close = cypress_close, .dtr_rts = cypress_dtr_rts, .write = cypress_write, .write_room = cypress_write_room, .init_termios = cypress_earthmate_init_termios, .set_termios = cypress_set_termios, .tiocmget = cypress_tiocmget, .tiocmset = cypress_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .chars_in_buffer = cypress_chars_in_buffer, .throttle = cypress_throttle, .unthrottle = cypress_unthrottle, .read_int_callback = cypress_read_int_callback, .write_int_callback = cypress_write_int_callback, }; static struct usb_serial_driver cypress_hidcom_device = { .driver = { .name = "cyphidcom", }, .description = "HID->COM RS232 Adapter", .id_table = id_table_cyphidcomrs232, .num_ports = 1, .port_probe = cypress_hidcom_port_probe, .port_remove = cypress_port_remove, .open = cypress_open, .close = cypress_close, .dtr_rts = cypress_dtr_rts, .write = cypress_write, .write_room = cypress_write_room, .set_termios = cypress_set_termios, .tiocmget = cypress_tiocmget, .tiocmset = cypress_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .chars_in_buffer = cypress_chars_in_buffer, .throttle = cypress_throttle, .unthrottle = cypress_unthrottle, .read_int_callback = cypress_read_int_callback, .write_int_callback = cypress_write_int_callback, }; static struct usb_serial_driver cypress_ca42v2_device = { .driver = { .name = "nokiaca42v2", }, .description = "Nokia CA-42 V2 Adapter", .id_table = id_table_nokiaca42v2, .num_ports = 1, .port_probe = cypress_ca42v2_port_probe, .port_remove = cypress_port_remove, .open = cypress_open, .close = cypress_close, .dtr_rts = cypress_dtr_rts, .write = cypress_write, .write_room = cypress_write_room, .set_termios = cypress_set_termios, .tiocmget = cypress_tiocmget, .tiocmset = cypress_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .chars_in_buffer = cypress_chars_in_buffer, .throttle = cypress_throttle, .unthrottle = cypress_unthrottle, .read_int_callback = cypress_read_int_callback, .write_int_callback = cypress_write_int_callback, }; static struct usb_serial_driver * const serial_drivers[] = { &cypress_earthmate_device, &cypress_hidcom_device, &cypress_ca42v2_device, NULL }; /***************************************************************************** * Cypress serial helper functions *****************************************************************************/ /* FRWD Dongle hidcom needs to skip reset and speed checks */ static inline bool is_frwd(struct usb_device *dev) { return ((le16_to_cpu(dev->descriptor.idVendor) == VENDOR_ID_FRWD) && (le16_to_cpu(dev->descriptor.idProduct) == PRODUCT_ID_CYPHIDCOM_FRWD)); } static int analyze_baud_rate(struct usb_serial_port *port, speed_t new_rate) { struct cypress_private *priv; priv = usb_get_serial_port_data(port); if (unstable_bauds) return new_rate; /* FRWD Dongle uses 115200 bps */ if (is_frwd(port->serial->dev)) return new_rate; /* * The general purpose firmware for the Cypress M8 allows for * a maximum speed of 57600bps (I have no idea whether DeLorme * chose to use the general purpose firmware or not), if you * need to modify this speed setting for your own project * please add your own chiptype and modify the code likewise. * The Cypress HID->COM device will work successfully up to * 115200bps (but the actual throughput is around 3kBps). */ if (port->serial->dev->speed == USB_SPEED_LOW) { /* * Mike Isely <isely@pobox.com> 2-Feb-2008: The * Cypress app note that describes this mechanism * states that the low-speed part can't handle more * than 800 bytes/sec, in which case 4800 baud is the * safest speed for a part like that. */ if (new_rate > 4800) { dev_dbg(&port->dev, "%s - failed setting baud rate, device incapable speed %d\n", __func__, new_rate); return -1; } } switch (priv->chiptype) { case CT_EARTHMATE: if (new_rate <= 600) { /* 300 and 600 baud rates are supported under * the generic firmware, but are not used with * NMEA and SiRF protocols */ dev_dbg(&port->dev, "%s - failed setting baud rate, unsupported speed of %d on Earthmate GPS\n", __func__, new_rate); return -1; } break; default: break; } return new_rate; } /* This function can either set or retrieve the current serial line settings */ static int cypress_serial_control(struct tty_struct *tty, struct usb_serial_port *port, speed_t baud_rate, int data_bits, int stop_bits, int parity_enable, int parity_type, int reset, int cypress_request_type) { int new_baudrate = 0, retval = 0, tries = 0; struct cypress_private *priv; struct device *dev = &port->dev; u8 *feature_buffer; const unsigned int feature_len = 5; unsigned long flags; priv = usb_get_serial_port_data(port); if (!priv->comm_is_ok) return -ENODEV; feature_buffer = kcalloc(feature_len, sizeof(u8), GFP_KERNEL); if (!feature_buffer) return -ENOMEM; switch (cypress_request_type) { case CYPRESS_SET_CONFIG: /* 0 means 'Hang up' so doesn't change the true bit rate */ new_baudrate = priv->baud_rate; if (baud_rate && baud_rate != priv->baud_rate) { dev_dbg(dev, "%s - baud rate is changing\n", __func__); retval = analyze_baud_rate(port, baud_rate); if (retval >= 0) { new_baudrate = retval; dev_dbg(dev, "%s - New baud rate set to %d\n", __func__, new_baudrate); } } dev_dbg(dev, "%s - baud rate is being sent as %d\n", __func__, new_baudrate); /* fill the feature_buffer with new configuration */ put_unaligned_le32(new_baudrate, feature_buffer); feature_buffer[4] |= data_bits - 5; /* assign data bits in 2 bit space ( max 3 ) */ /* 1 bit gap */ feature_buffer[4] |= (stop_bits << 3); /* assign stop bits in 1 bit space */ feature_buffer[4] |= (parity_enable << 4); /* assign parity flag in 1 bit space */ feature_buffer[4] |= (parity_type << 5); /* assign parity type in 1 bit space */ /* 1 bit gap */ feature_buffer[4] |= (reset << 7); /* assign reset at end of byte, 1 bit space */ dev_dbg(dev, "%s - device is being sent this feature report:\n", __func__); dev_dbg(dev, "%s - %02X - %02X - %02X - %02X - %02X\n", __func__, feature_buffer[0], feature_buffer[1], feature_buffer[2], feature_buffer[3], feature_buffer[4]); do { retval = usb_control_msg(port->serial->dev, usb_sndctrlpipe(port->serial->dev, 0), HID_REQ_SET_REPORT, USB_DIR_OUT | USB_RECIP_INTERFACE | USB_TYPE_CLASS, 0x0300, 0, feature_buffer, feature_len, 500); if (tries++ >= 3) break; } while (retval != feature_len && retval != -ENODEV); if (retval != feature_len) { dev_err(dev, "%s - failed sending serial line settings - %d\n", __func__, retval); cypress_set_dead(port); } else { spin_lock_irqsave(&priv->lock, flags); priv->baud_rate = new_baudrate; priv->current_config = feature_buffer[4]; spin_unlock_irqrestore(&priv->lock, flags); /* If we asked for a speed change encode it */ if (baud_rate) tty_encode_baud_rate(tty, new_baudrate, new_baudrate); } break; case CYPRESS_GET_CONFIG: if (priv->get_cfg_unsafe) { /* Not implemented for this device, and if we try to do it we're likely to crash the hardware. */ retval = -ENOTTY; goto out; } dev_dbg(dev, "%s - retrieving serial line settings\n", __func__); do { retval = usb_control_msg(port->serial->dev, usb_rcvctrlpipe(port->serial->dev, 0), HID_REQ_GET_REPORT, USB_DIR_IN | USB_RECIP_INTERFACE | USB_TYPE_CLASS, 0x0300, 0, feature_buffer, feature_len, 500); if (tries++ >= 3) break; } while (retval != feature_len && retval != -ENODEV); if (retval != feature_len) { dev_err(dev, "%s - failed to retrieve serial line settings - %d\n", __func__, retval); cypress_set_dead(port); goto out; } else { spin_lock_irqsave(&priv->lock, flags); /* store the config in one byte, and later use bit masks to check values */ priv->current_config = feature_buffer[4]; priv->baud_rate = get_unaligned_le32(feature_buffer); spin_unlock_irqrestore(&priv->lock, flags); } } spin_lock_irqsave(&priv->lock, flags); ++priv->cmd_count; spin_unlock_irqrestore(&priv->lock, flags); out: kfree(feature_buffer); return retval; } /* cypress_serial_control */ static void cypress_set_dead(struct usb_serial_port *port) { struct cypress_private *priv = usb_get_serial_port_data(port); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); if (!priv->comm_is_ok) { spin_unlock_irqrestore(&priv->lock, flags); return; } priv->comm_is_ok = 0; spin_unlock_irqrestore(&priv->lock, flags); dev_err(&port->dev, "cypress_m8 suspending failing port %d - " "interval might be too short\n", port->port_number); } /***************************************************************************** * Cypress serial driver functions *****************************************************************************/ static int cypress_generic_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct cypress_private *priv; if (!port->interrupt_out_urb || !port->interrupt_in_urb) { dev_err(&port->dev, "required endpoint is missing\n"); return -ENODEV; } priv = kzalloc(sizeof(struct cypress_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->comm_is_ok = !0; spin_lock_init(&priv->lock); if (kfifo_alloc(&priv->write_fifo, CYPRESS_BUF_SIZE, GFP_KERNEL)) { kfree(priv); return -ENOMEM; } /* Skip reset for FRWD device. It is a workaound: device hangs if it receives SET_CONFIGURE in Configured state. */ if (!is_frwd(serial->dev)) usb_reset_configuration(serial->dev); priv->cmd_ctrl = 0; priv->line_control = 0; priv->rx_flags = 0; /* Default packet format setting is determined by packet size. Anything with a size larger then 9 must have a separate count field since the 3 bit count field is otherwise too small. Otherwise we can use the slightly more compact format. This is in accordance with the cypress_m8 serial converter app note. */ if (port->interrupt_out_size > 9) priv->pkt_fmt = packet_format_1; else priv->pkt_fmt = packet_format_2; if (interval > 0) { priv->write_urb_interval = interval; priv->read_urb_interval = interval; dev_dbg(&port->dev, "%s - read & write intervals forced to %d\n", __func__, interval); } else { priv->write_urb_interval = port->interrupt_out_urb->interval; priv->read_urb_interval = port->interrupt_in_urb->interval; dev_dbg(&port->dev, "%s - intervals: read=%d write=%d\n", __func__, priv->read_urb_interval, priv->write_urb_interval); } usb_set_serial_port_data(port, priv); port->port.drain_delay = 256; return 0; } static int cypress_earthmate_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct cypress_private *priv; int ret; ret = cypress_generic_port_probe(port); if (ret) { dev_dbg(&port->dev, "%s - Failed setting up port\n", __func__); return ret; } priv = usb_get_serial_port_data(port); priv->chiptype = CT_EARTHMATE; /* All Earthmate devices use the separated-count packet format! Idiotic. */ priv->pkt_fmt = packet_format_1; if (serial->dev->descriptor.idProduct != cpu_to_le16(PRODUCT_ID_EARTHMATEUSB)) { /* The old original USB Earthmate seemed able to handle GET_CONFIG requests; everything they've produced since that time crashes if this command is attempted :-( */ dev_dbg(&port->dev, "%s - Marking this device as unsafe for GET_CONFIG commands\n", __func__); priv->get_cfg_unsafe = !0; } return 0; } static int cypress_hidcom_port_probe(struct usb_serial_port *port) { struct cypress_private *priv; int ret; ret = cypress_generic_port_probe(port); if (ret) { dev_dbg(&port->dev, "%s - Failed setting up port\n", __func__); return ret; } priv = usb_get_serial_port_data(port); priv->chiptype = CT_CYPHIDCOM; return 0; } static int cypress_ca42v2_port_probe(struct usb_serial_port *port) { struct cypress_private *priv; int ret; ret = cypress_generic_port_probe(port); if (ret) { dev_dbg(&port->dev, "%s - Failed setting up port\n", __func__); return ret; } priv = usb_get_serial_port_data(port); priv->chiptype = CT_CA42V2; return 0; } static void cypress_port_remove(struct usb_serial_port *port) { struct cypress_private *priv; priv = usb_get_serial_port_data(port); kfifo_free(&priv->write_fifo); kfree(priv); } static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port) { struct cypress_private *priv = usb_get_serial_port_data(port); struct usb_serial *serial = port->serial; unsigned long flags; int result = 0; if (!priv->comm_is_ok) return -EIO; /* clear halts before open */ usb_clear_halt(serial->dev, 0x81); usb_clear_halt(serial->dev, 0x02); spin_lock_irqsave(&priv->lock, flags); /* reset read/write statistics */ priv->bytes_in = 0; priv->bytes_out = 0; priv->cmd_count = 0; priv->rx_flags = 0; spin_unlock_irqrestore(&priv->lock, flags); /* Set termios */ cypress_send(port); if (tty) cypress_set_termios(tty, port, NULL); /* setup the port and start reading from the device */ usb_fill_int_urb(port->interrupt_in_urb, serial->dev, usb_rcvintpipe(serial->dev, port->interrupt_in_endpointAddress), port->interrupt_in_urb->transfer_buffer, port->interrupt_in_urb->transfer_buffer_length, cypress_read_int_callback, port, priv->read_urb_interval); result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) { dev_err(&port->dev, "%s - failed submitting read urb, error %d\n", __func__, result); cypress_set_dead(port); } return result; } /* cypress_open */ static void cypress_dtr_rts(struct usb_serial_port *port, int on) { struct cypress_private *priv = usb_get_serial_port_data(port); /* drop dtr and rts */ spin_lock_irq(&priv->lock); if (on == 0) priv->line_control = 0; else priv->line_control = CONTROL_DTR | CONTROL_RTS; priv->cmd_ctrl = 1; spin_unlock_irq(&priv->lock); cypress_write(NULL, port, NULL, 0); } static void cypress_close(struct usb_serial_port *port) { struct cypress_private *priv = usb_get_serial_port_data(port); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); kfifo_reset_out(&priv->write_fifo); spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(&port->dev, "%s - stopping urbs\n", __func__); usb_kill_urb(port->interrupt_in_urb); usb_kill_urb(port->interrupt_out_urb); if (stats) dev_info(&port->dev, "Statistics: %d Bytes In | %d Bytes Out | %d Commands Issued\n", priv->bytes_in, priv->bytes_out, priv->cmd_count); } /* cypress_close */ static int cypress_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { struct cypress_private *priv = usb_get_serial_port_data(port); dev_dbg(&port->dev, "%s - %d bytes\n", __func__, count); /* line control commands, which need to be executed immediately, are not put into the buffer for obvious reasons. */ if (priv->cmd_ctrl) { count = 0; goto finish; } if (!count) return count; count = kfifo_in_locked(&priv->write_fifo, buf, count, &priv->lock); finish: cypress_send(port); return count; } /* cypress_write */ static void cypress_send(struct usb_serial_port *port) { int count = 0, result, offset, actual_size; struct cypress_private *priv = usb_get_serial_port_data(port); struct device *dev = &port->dev; unsigned long flags; if (!priv->comm_is_ok) return; dev_dbg(dev, "%s - interrupt out size is %d\n", __func__, port->interrupt_out_size); spin_lock_irqsave(&priv->lock, flags); if (priv->write_urb_in_use) { dev_dbg(dev, "%s - can't write, urb in use\n", __func__); spin_unlock_irqrestore(&priv->lock, flags); return; } spin_unlock_irqrestore(&priv->lock, flags); /* clear buffer */ memset(port->interrupt_out_urb->transfer_buffer, 0, port->interrupt_out_size); spin_lock_irqsave(&priv->lock, flags); switch (priv->pkt_fmt) { default: case packet_format_1: /* this is for the CY7C64013... */ offset = 2; port->interrupt_out_buffer[0] = priv->line_control; break; case packet_format_2: /* this is for the CY7C63743... */ offset = 1; port->interrupt_out_buffer[0] = priv->line_control; break; } if (priv->line_control & CONTROL_RESET) priv->line_control &= ~CONTROL_RESET; if (priv->cmd_ctrl) { priv->cmd_count++; dev_dbg(dev, "%s - line control command being issued\n", __func__); spin_unlock_irqrestore(&priv->lock, flags); goto send; } else spin_unlock_irqrestore(&priv->lock, flags); count = kfifo_out_locked(&priv->write_fifo, &port->interrupt_out_buffer[offset], port->interrupt_out_size - offset, &priv->lock); if (count == 0) return; switch (priv->pkt_fmt) { default: case packet_format_1: port->interrupt_out_buffer[1] = count; break; case packet_format_2: port->interrupt_out_buffer[0] |= count; } dev_dbg(dev, "%s - count is %d\n", __func__, count); send: spin_lock_irqsave(&priv->lock, flags); priv->write_urb_in_use = 1; spin_unlock_irqrestore(&priv->lock, flags); if (priv->cmd_ctrl) actual_size = 1; else actual_size = count + (priv->pkt_fmt == packet_format_1 ? 2 : 1); usb_serial_debug_data(dev, __func__, port->interrupt_out_size, port->interrupt_out_urb->transfer_buffer); usb_fill_int_urb(port->interrupt_out_urb, port->serial->dev, usb_sndintpipe(port->serial->dev, port->interrupt_out_endpointAddress), port->interrupt_out_buffer, actual_size, cypress_write_int_callback, port, priv->write_urb_interval); result = usb_submit_urb(port->interrupt_out_urb, GFP_ATOMIC); if (result) { dev_err_console(port, "%s - failed submitting write urb, error %d\n", __func__, result); priv->write_urb_in_use = 0; cypress_set_dead(port); } spin_lock_irqsave(&priv->lock, flags); if (priv->cmd_ctrl) priv->cmd_ctrl = 0; /* do not count the line control and size bytes */ priv->bytes_out += count; spin_unlock_irqrestore(&priv->lock, flags); usb_serial_port_softint(port); } /* cypress_send */ /* returns how much space is available in the soft buffer */ static unsigned int cypress_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct cypress_private *priv = usb_get_serial_port_data(port); unsigned int room; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); room = kfifo_avail(&priv->write_fifo); spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(&port->dev, "%s - returns %u\n", __func__, room); return room; } static int cypress_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct cypress_private *priv = usb_get_serial_port_data(port); __u8 status, control; unsigned int result = 0; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); control = priv->line_control; status = priv->current_status; spin_unlock_irqrestore(&priv->lock, flags); result = ((control & CONTROL_DTR) ? TIOCM_DTR : 0) | ((control & CONTROL_RTS) ? TIOCM_RTS : 0) | ((status & UART_CTS) ? TIOCM_CTS : 0) | ((status & UART_DSR) ? TIOCM_DSR : 0) | ((status & UART_RI) ? TIOCM_RI : 0) | ((status & UART_CD) ? TIOCM_CD : 0); dev_dbg(&port->dev, "%s - result = %x\n", __func__, result); return result; } static int cypress_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct cypress_private *priv = usb_get_serial_port_data(port); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); if (set & TIOCM_RTS) priv->line_control |= CONTROL_RTS; if (set & TIOCM_DTR) priv->line_control |= CONTROL_DTR; if (clear & TIOCM_RTS) priv->line_control &= ~CONTROL_RTS; if (clear & TIOCM_DTR) priv->line_control &= ~CONTROL_DTR; priv->cmd_ctrl = 1; spin_unlock_irqrestore(&priv->lock, flags); return cypress_write(tty, port, NULL, 0); } static void cypress_earthmate_init_termios(struct tty_struct *tty) { tty_encode_baud_rate(tty, 4800, 4800); } static void cypress_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct cypress_private *priv = usb_get_serial_port_data(port); struct device *dev = &port->dev; int data_bits, stop_bits, parity_type, parity_enable; unsigned int cflag; unsigned long flags; __u8 oldlines; int linechange = 0; /* Unsupported features need clearing */ tty->termios.c_cflag &= ~(CMSPAR|CRTSCTS); cflag = tty->termios.c_cflag; /* set number of data bits, parity, stop bits */ /* when parity is disabled the parity type bit is ignored */ /* 1 means 2 stop bits, 0 means 1 stop bit */ stop_bits = cflag & CSTOPB ? 1 : 0; if (cflag & PARENB) { parity_enable = 1; /* 1 means odd parity, 0 means even parity */ parity_type = cflag & PARODD ? 1 : 0; } else parity_enable = parity_type = 0; data_bits = tty_get_char_size(cflag); spin_lock_irqsave(&priv->lock, flags); oldlines = priv->line_control; if ((cflag & CBAUD) == B0) { /* drop dtr and rts */ dev_dbg(dev, "%s - dropping the lines, baud rate 0bps\n", __func__); priv->line_control &= ~(CONTROL_DTR | CONTROL_RTS); } else priv->line_control = (CONTROL_DTR | CONTROL_RTS); spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(dev, "%s - sending %d stop_bits, %d parity_enable, %d parity_type, %d data_bits (+5)\n", __func__, stop_bits, parity_enable, parity_type, data_bits); cypress_serial_control(tty, port, tty_get_baud_rate(tty), data_bits, stop_bits, parity_enable, parity_type, 0, CYPRESS_SET_CONFIG); /* we perform a CYPRESS_GET_CONFIG so that the current settings are * filled into the private structure this should confirm that all is * working if it returns what we just set */ cypress_serial_control(tty, port, 0, 0, 0, 0, 0, 0, CYPRESS_GET_CONFIG); /* Here we can define custom tty settings for devices; the main tty * termios flag base comes from empeg.c */ spin_lock_irqsave(&priv->lock, flags); if (priv->chiptype == CT_EARTHMATE && priv->baud_rate == 4800) { dev_dbg(dev, "Using custom termios settings for a baud rate of 4800bps.\n"); /* define custom termios settings for NMEA protocol */ tty->termios.c_iflag /* input modes - */ &= ~(IGNBRK /* disable ignore break */ | BRKINT /* disable break causes interrupt */ | PARMRK /* disable mark parity errors */ | ISTRIP /* disable clear high bit of input char */ | INLCR /* disable translate NL to CR */ | IGNCR /* disable ignore CR */ | ICRNL /* disable translate CR to NL */ | IXON); /* disable enable XON/XOFF flow control */ tty->termios.c_oflag /* output modes */ &= ~OPOST; /* disable postprocess output char */ tty->termios.c_lflag /* line discipline modes */ &= ~(ECHO /* disable echo input characters */ | ECHONL /* disable echo new line */ | ICANON /* disable erase, kill, werase, and rprnt special characters */ | ISIG /* disable interrupt, quit, and suspend special characters */ | IEXTEN); /* disable non-POSIX special characters */ } /* CT_CYPHIDCOM: Application should handle this for device */ linechange = (priv->line_control != oldlines); spin_unlock_irqrestore(&priv->lock, flags); /* if necessary, set lines */ if (linechange) { priv->cmd_ctrl = 1; cypress_write(tty, port, NULL, 0); } } /* cypress_set_termios */ /* returns amount of data still left in soft buffer */ static unsigned int cypress_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct cypress_private *priv = usb_get_serial_port_data(port); unsigned int chars; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); chars = kfifo_len(&priv->write_fifo); spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars); return chars; } static void cypress_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct cypress_private *priv = usb_get_serial_port_data(port); spin_lock_irq(&priv->lock); priv->rx_flags = THROTTLED; spin_unlock_irq(&priv->lock); } static void cypress_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct cypress_private *priv = usb_get_serial_port_data(port); int actually_throttled, result; spin_lock_irq(&priv->lock); actually_throttled = priv->rx_flags & ACTUALLY_THROTTLED; priv->rx_flags = 0; spin_unlock_irq(&priv->lock); if (!priv->comm_is_ok) return; if (actually_throttled) { result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) { dev_err(&port->dev, "%s - failed submitting read urb, " "error %d\n", __func__, result); cypress_set_dead(port); } } } static void cypress_read_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct cypress_private *priv = usb_get_serial_port_data(port); struct device *dev = &urb->dev->dev; struct tty_struct *tty; unsigned char *data = urb->transfer_buffer; unsigned long flags; char tty_flag = TTY_NORMAL; int bytes = 0; int result; int i = 0; int status = urb->status; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* precursor to disconnect so just go away */ return; case -EPIPE: /* Can't call usb_clear_halt while in_interrupt */ fallthrough; default: /* something ugly is going on... */ dev_err(dev, "%s - unexpected nonzero read status received: %d\n", __func__, status); cypress_set_dead(port); return; } spin_lock_irqsave(&priv->lock, flags); if (priv->rx_flags & THROTTLED) { dev_dbg(dev, "%s - now throttling\n", __func__); priv->rx_flags |= ACTUALLY_THROTTLED; spin_unlock_irqrestore(&priv->lock, flags); return; } spin_unlock_irqrestore(&priv->lock, flags); tty = tty_port_tty_get(&port->port); if (!tty) { dev_dbg(dev, "%s - bad tty pointer - exiting\n", __func__); return; } spin_lock_irqsave(&priv->lock, flags); result = urb->actual_length; switch (priv->pkt_fmt) { default: case packet_format_1: /* This is for the CY7C64013... */ priv->current_status = data[0] & 0xF8; bytes = data[1] + 2; i = 2; break; case packet_format_2: /* This is for the CY7C63743... */ priv->current_status = data[0] & 0xF8; bytes = (data[0] & 0x07) + 1; i = 1; break; } spin_unlock_irqrestore(&priv->lock, flags); if (result < bytes) { dev_dbg(dev, "%s - wrong packet size - received %d bytes but packet said %d bytes\n", __func__, result, bytes); goto continue_read; } usb_serial_debug_data(&port->dev, __func__, urb->actual_length, data); spin_lock_irqsave(&priv->lock, flags); /* check to see if status has changed */ if (priv->current_status != priv->prev_status) { u8 delta = priv->current_status ^ priv->prev_status; if (delta & UART_MSR_MASK) { if (delta & UART_CTS) port->icount.cts++; if (delta & UART_DSR) port->icount.dsr++; if (delta & UART_RI) port->icount.rng++; if (delta & UART_CD) port->icount.dcd++; wake_up_interruptible(&port->port.delta_msr_wait); } priv->prev_status = priv->current_status; } spin_unlock_irqrestore(&priv->lock, flags); /* hangup, as defined in acm.c... this might be a bad place for it * though */ if (tty && !C_CLOCAL(tty) && !(priv->current_status & UART_CD)) { dev_dbg(dev, "%s - calling hangup\n", __func__); tty_hangup(tty); goto continue_read; } /* There is one error bit... I'm assuming it is a parity error * indicator as the generic firmware will set this bit to 1 if a * parity error occurs. * I can not find reference to any other error events. */ spin_lock_irqsave(&priv->lock, flags); if (priv->current_status & CYP_ERROR) { spin_unlock_irqrestore(&priv->lock, flags); tty_flag = TTY_PARITY; dev_dbg(dev, "%s - Parity Error detected\n", __func__); } else spin_unlock_irqrestore(&priv->lock, flags); /* process read if there is data other than line status */ if (bytes > i) { tty_insert_flip_string_fixed_flag(&port->port, data + i, tty_flag, bytes - i); tty_flip_buffer_push(&port->port); } spin_lock_irqsave(&priv->lock, flags); /* control and status byte(s) are also counted */ priv->bytes_in += bytes; spin_unlock_irqrestore(&priv->lock, flags); continue_read: tty_kref_put(tty); /* Continue trying to always read */ if (priv->comm_is_ok) { usb_fill_int_urb(port->interrupt_in_urb, port->serial->dev, usb_rcvintpipe(port->serial->dev, port->interrupt_in_endpointAddress), port->interrupt_in_urb->transfer_buffer, port->interrupt_in_urb->transfer_buffer_length, cypress_read_int_callback, port, priv->read_urb_interval); result = usb_submit_urb(port->interrupt_in_urb, GFP_ATOMIC); if (result && result != -EPERM) { dev_err(dev, "%s - failed resubmitting read urb, error %d\n", __func__, result); cypress_set_dead(port); } } } /* cypress_read_int_callback */ static void cypress_write_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct cypress_private *priv = usb_get_serial_port_data(port); struct device *dev = &urb->dev->dev; int status = urb->status; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, status); priv->write_urb_in_use = 0; return; case -EPIPE: /* Cannot call usb_clear_halt while in_interrupt */ fallthrough; default: dev_err(dev, "%s - unexpected nonzero write status received: %d\n", __func__, status); cypress_set_dead(port); break; } priv->write_urb_in_use = 0; /* send any buffered data */ cypress_send(port); } module_usb_serial_driver(serial_drivers, id_table_combined); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); module_param(stats, bool, 0644); MODULE_PARM_DESC(stats, "Enable statistics or not"); module_param(interval, int, 0644); MODULE_PARM_DESC(interval, "Overrides interrupt interval"); module_param(unstable_bauds, bool, 0644); MODULE_PARM_DESC(unstable_bauds, "Allow unstable baud rates");
1 3 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. */ #ifndef __INCORE_DOT_H__ #define __INCORE_DOT_H__ #include <linux/fs.h> #include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/dlm.h> #include <linux/buffer_head.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/completion.h> #include <linux/rbtree.h> #include <linux/ktime.h> #include <linux/percpu.h> #include <linux/lockref.h> #include <linux/rhashtable.h> #include <linux/mutex.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 struct gfs2_log_operations; struct gfs2_bufdata; struct gfs2_holder; struct gfs2_glock; struct gfs2_quota_data; struct gfs2_trans; struct gfs2_jdesc; struct gfs2_sbd; struct lm_lockops; typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); struct gfs2_log_header_host { u64 lh_sequence; /* Sequence number of this transaction */ u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail; /* Block number of log tail */ u32 lh_blkno; s64 lh_local_total; s64 lh_local_free; s64 lh_local_dinodes; }; /* * Structure of operations that are associated with each * type of element in the log. */ struct gfs2_log_operations { void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_scan) (struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass); int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, struct gfs2_log_descriptor *ld, __be64 *ptr, int pass); void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); const char *lo_name; }; #define GBF_FULL 1 /** * Clone bitmaps (bi_clone): * * - When a block is freed, we remember the previous state of the block in the * clone bitmap, and only mark the block as free in the real bitmap. * * - When looking for a block to allocate, we check for a free block in the * clone bitmap, and if no clone bitmap exists, in the real bitmap. * * - For allocating a block, we mark it as allocated in the real bitmap, and if * a clone bitmap exists, also in the clone bitmap. * * - At the end of a log_flush, we copy the real bitmap into the clone bitmap * to make the clone bitmap reflect the current allocation state. * (Alternatively, we could remove the clone bitmap.) * * The clone bitmaps are in-core only, and is never written to disk. * * These steps ensure that blocks which have been freed in a transaction cannot * be reallocated in that same transaction. */ struct gfs2_bitmap { struct buffer_head *bi_bh; char *bi_clone; unsigned long bi_flags; u32 bi_offset; u32 bi_start; u32 bi_bytes; u32 bi_blocks; }; struct gfs2_rgrpd { struct rb_node rd_node; /* Link with superblock */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ u32 rd_length; /* length of rgrp header in fs blocks */ u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ u32 rd_free; u32 rd_requested; /* number of blocks in rd_rstree */ u32 rd_reserved; /* number of reserved blocks */ u32 rd_free_clone; u32 rd_dinodes; u64 rd_igeneration; struct gfs2_bitmap *rd_bits; struct gfs2_sbd *rd_sbd; struct gfs2_rgrp_lvb *rd_rgl; u32 rd_last_alloc; u32 rd_flags; u32 rd_extfail_pt; /* extent failure point */ #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ #define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ spinlock_t rd_rsspin; /* protects reservation related vars */ struct mutex rd_mutex; struct rb_root rd_rstree; /* multi-block reservation tree */ }; enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, }; BUFFER_FNS(Pinned, pinned) TAS_BUFFER_FNS(Pinned, pinned) BUFFER_FNS(Escaped, escaped) TAS_BUFFER_FNS(Escaped, escaped) struct gfs2_bufdata { struct buffer_head *bd_bh; struct gfs2_glock *bd_gl; u64 bd_blkno; struct list_head bd_list; struct gfs2_trans *bd_tr; struct list_head bd_ail_st_list; struct list_head bd_ail_gl_list; }; /* * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a * prefix of lock_dlm_ gets awkward. */ #define GDLM_STRNAME_BYTES 25 #define GDLM_LVB_SIZE 32 /* * ls_recover_flags: * * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been * held by failed nodes whose journals need recovery. Those locks should * only be used for journal recovery until the journal recovery is done. * This is set by the dlm recover_prep callback and cleared by the * gfs2_control thread when journal recovery is complete. To avoid * races between recover_prep setting and gfs2_control clearing, recover_spin * is held while changing this bit and reading/writing recover_block * and recover_start. * * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. * * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing * recovery of all journals before allowing other nodes to mount the fs. * This is cleared when FIRST_MOUNT_DONE is set. * * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished * recovery of all journals, and now allows other nodes to mount the fs. * * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared * BLOCK_LOCKS for the first time. The gfs2_control thread should now * control clearing BLOCK_LOCKS for further recoveries. * * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. * * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() * and recover_done(), i.e. set while recover_block == recover_start. */ enum { DFL_BLOCK_LOCKS = 0, DFL_NO_DLM_OPS = 1, DFL_FIRST_MOUNT = 2, DFL_FIRST_MOUNT_DONE = 3, DFL_MOUNT_DONE = 4, DFL_UNMOUNT = 5, DFL_DLM_RECOVERY = 6, }; /* * We are using struct lm_lockname as an rhashtable key. Avoid holes within * the struct; padding at the end is fine. */ struct lm_lockname { u64 ln_number; struct gfs2_sbd *ln_sbd; unsigned int ln_type; }; #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ ((name1)->ln_type == (name2)->ln_type) && \ ((name1)->ln_sbd == (name2)->ln_sbd)) struct gfs2_glock_operations { int (*go_sync) (struct gfs2_glock *gl); int (*go_xmote_bh)(struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_instantiate) (struct gfs2_glock *gl); int (*go_held)(struct gfs2_holder *gh); void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl, const char *fs_id_buf); void (*go_callback)(struct gfs2_glock *gl, bool remote); void (*go_unlocked)(struct gfs2_glock *gl); const int go_subclass; const int go_type; const unsigned long go_flags; #define GLOF_ASPACE 1 /* address space attached */ #define GLOF_LVB 2 /* Lock Value Block attached */ #define GLOF_NONDISK 8 /* not I/O related */ }; enum { GFS2_LKS_SRTT = 0, /* Non blocking smoothed round trip time */ GFS2_LKS_SRTTVAR = 1, /* Non blocking smoothed variance */ GFS2_LKS_SRTTB = 2, /* Blocking smoothed round trip time */ GFS2_LKS_SRTTVARB = 3, /* Blocking smoothed variance */ GFS2_LKS_SIRT = 4, /* Smoothed Inter-request time */ GFS2_LKS_SIRTVAR = 5, /* Smoothed Inter-request variance */ GFS2_LKS_DCOUNT = 6, /* Count of dlm requests */ GFS2_LKS_QCOUNT = 7, /* Count of gfs2_holder queues */ GFS2_NR_LKSTATS }; struct gfs2_lkstats { u64 stats[GFS2_NR_LKSTATS]; }; enum { /* States */ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_WAIT = 10, }; struct gfs2_holder { struct list_head gh_list; struct gfs2_glock *gh_gl; struct pid *gh_owner_pid; u16 gh_flags; u16 gh_state; int gh_error; unsigned long gh_iflags; /* HIF_... */ unsigned long gh_ip; }; /* Number of quota types we support */ #define GFS2_MAXQUOTAS 2 struct gfs2_qadata { /* quota allocation data */ /* Quota stuff */ struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS]; struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS]; unsigned int qa_qd_num; int qa_ref; }; /* Resource group multi-block reservation, in order of appearance: Step 1. Function prepares to write, allocates a mb, sets the size hint. Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use Step 4. Bits are assigned from the rgrp based on either the reservation or wherever it can. */ struct gfs2_blkreserv { struct rb_node rs_node; /* node within rd_rstree */ struct gfs2_rgrpd *rs_rgd; u64 rs_start; u32 rs_requested; u32 rs_reserved; /* number of reserved blocks */ }; /* * Allocation parameters * @target: The number of blocks we'd ideally like to allocate * @aflags: The flags (e.g. Orlov flag) * * The intent is to gradually expand this structure over time in * order to give more information, e.g. alignment, min extent size * to the allocation code. */ struct gfs2_alloc_parms { u64 target; u32 min_target; u32 aflags; u64 allowed; }; enum { GLF_LOCK = 1, GLF_INSTANTIATE_NEEDED = 2, /* needs instantiate */ GLF_DEMOTE = 3, GLF_PENDING_DEMOTE = 4, GLF_DEMOTE_IN_PROGRESS = 5, GLF_DIRTY = 6, GLF_LFLUSH = 7, GLF_INVALIDATE_IN_PROGRESS = 8, GLF_HAVE_REPLY = 9, GLF_INITIAL = 10, GLF_HAVE_FROZEN_REPLY = 11, GLF_INSTANTIATE_IN_PROG = 12, /* instantiate happening now */ GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */ GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ GLF_VERIFY_DELETE = 18, /* iopen glocks only */ GLF_PENDING_REPLY = 19, GLF_DEFER_DELETE = 20, /* iopen glocks only */ GLF_CANCELING = 21, }; struct gfs2_glock { unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; struct lockref gl_lockref; /* State fields protected by gl_lockref.lock */ unsigned int gl_state:2, /* Current state */ gl_target:2, /* Target state */ gl_demote_state:2, /* State requested by remote node */ gl_req:2, /* State in last dlm request */ gl_reply:8; /* Last reply from the dlm */ unsigned long gl_demote_time; /* time of first demote request */ long gl_hold_time; struct list_head gl_holders; const struct gfs2_glock_operations *gl_ops; ktime_t gl_dstamp; struct gfs2_lkstats gl_stats; struct dlm_lksb gl_lksb; unsigned long gl_tchange; void *gl_object; struct list_head gl_lru; struct list_head gl_ail_list; atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; /* For iopen glocks only */ struct { struct delayed_work gl_delete; u64 gl_no_formal_ino; }; struct rcu_head gl_rcu; struct rhash_head gl_node; }; enum { GIF_QD_LOCKED = 1, GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, }; struct gfs2_inode { struct inode i_inode; u64 i_no_addr; u64 i_no_formal_ino; u64 i_generation; u64 i_eattr; unsigned long i_flags; /* GIF_... */ struct gfs2_glock *i_gl; struct gfs2_holder i_iopen_gh; struct gfs2_qadata *i_qadata; /* quota allocation data */ struct gfs2_holder i_rgd_gh; struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ u64 i_goal; /* goal block for allocations */ atomic_t i_sizehint; /* hint of the write size */ struct rw_semaphore i_rw_mutex; struct list_head i_ordered; __be64 *i_hash_cache; u32 i_entries; u32 i_diskflags; u8 i_height; u8 i_depth; u16 i_rahead; }; /* * Since i_inode is the first element of struct gfs2_inode, * this is effectively a cast. */ static inline struct gfs2_inode *GFS2_I(struct inode *inode) { return container_of(inode, struct gfs2_inode, i_inode); } static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode) { return inode->i_sb->s_fs_info; } struct gfs2_file { struct mutex f_fl_mutex; struct gfs2_holder f_fl_gh; }; struct gfs2_revoke_replay { struct list_head rr_list; u64 rr_blkno; unsigned int rr_where; }; enum { QDF_CHANGE = 1, QDF_LOCKED = 2, QDF_REFRESH = 3, QDF_QMSG_QUIET = 4, }; struct gfs2_quota_data { struct hlist_bl_node qd_hlist; struct list_head qd_list; struct kqid qd_id; struct gfs2_sbd *qd_sbd; struct lockref qd_lockref; struct list_head qd_lru; unsigned qd_hash; unsigned long qd_flags; /* QDF_... */ s64 qd_change; s64 qd_change_sync; unsigned int qd_slot; unsigned int qd_slot_ref; struct buffer_head *qd_bh; struct gfs2_quota_change *qd_bh_qc; unsigned int qd_bh_count; struct gfs2_glock *qd_gl; struct gfs2_quota_lvb qd_qb; u64 qd_sync_gen; unsigned long qd_last_warn; struct rcu_head qd_rcu; }; enum { TR_TOUCHED = 1, TR_ATTACHED = 2, TR_ONSTACK = 3, }; struct gfs2_trans { unsigned long tr_ip; unsigned int tr_blocks; unsigned int tr_revokes; unsigned int tr_reserved; unsigned long tr_flags; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; struct list_head tr_list; struct list_head tr_databuf; struct list_head tr_buf; unsigned int tr_first; struct list_head tr_ail1_list; struct list_head tr_ail2_list; }; struct gfs2_journal_extent { struct list_head list; unsigned int lblock; /* First logical block */ u64 dblock; /* First disk block */ u64 blocks; }; struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; unsigned int nr_extents; struct work_struct jd_work; struct inode *jd_inode; struct bio *jd_log_bio; unsigned long jd_flags; #define JDF_RECOVERY 1 unsigned int jd_jid; u32 jd_blocks; int jd_recover_error; /* Replay stuff */ unsigned int jd_found_blocks; unsigned int jd_found_revokes; unsigned int jd_replayed_blocks; struct list_head jd_revoke_list; unsigned int jd_replay_tail; u64 jd_no_addr; }; struct gfs2_statfs_change_host { s64 sc_total; s64 sc_free; s64 sc_dinodes; }; #define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF #define GFS2_QUOTA_OFF 0 #define GFS2_QUOTA_ACCOUNT 1 #define GFS2_QUOTA_ON 2 #define GFS2_QUOTA_QUIET 3 /* on but not complaining */ #define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED #define GFS2_DATA_WRITEBACK 1 #define GFS2_DATA_ORDERED 2 #define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW #define GFS2_ERRORS_WITHDRAW 0 #define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ #define GFS2_ERRORS_RO 2 /* place holder for future feature */ #define GFS2_ERRORS_PANIC 3 struct gfs2_args { char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ unsigned int ar_spectator:1; /* Don't get a journal */ unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ unsigned int ar_debug:1; /* Oops on errors */ unsigned int ar_posix_acl:1; /* Enable posix acls */ unsigned int ar_quota:2; /* off/account/on */ unsigned int ar_suiddir:1; /* suiddir support */ unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ unsigned int ar_errors:2; /* errors=withdraw | panic */ unsigned int ar_nobarrier:1; /* do not send barriers */ unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ unsigned int ar_got_rgrplvb:1; /* Was the rgrplvb opt given? */ unsigned int ar_loccookie:1; /* use location based readdir cookies */ s32 ar_commit; /* Commit interval */ s32 ar_statfs_quantum; /* The fast statfs interval */ s32 ar_quota_quantum; /* The quota interval */ s32 ar_statfs_percent; /* The % change to force sync */ }; struct gfs2_tune { spinlock_t gt_spin; unsigned int gt_logd_secs; unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ unsigned int gt_quota_scale_num; /* Numerator */ unsigned int gt_quota_scale_den; /* Denominator */ unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ unsigned int gt_new_files_jdata; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ unsigned int gt_complain_secs; unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; }; enum { SDF_JOURNAL_CHECKED = 0, SDF_JOURNAL_LIVE = 1, SDF_WITHDRAWN = 2, SDF_NOBARRIERS = 3, SDF_NORECOVERY = 4, SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, SDF_FORCE_AIL_FLUSH = 9, SDF_FREEZE_INITIATOR = 10, SDF_WITHDRAWING = 11, /* Will withdraw eventually */ SDF_WITHDRAW_IN_PROG = 12, /* Withdraw is in progress */ SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are withdrawing */ SDF_KILL = 15, SDF_EVICTING = 16, SDF_FROZEN = 17, }; #define GFS2_FSNAME_LEN 256 struct gfs2_inum_host { u64 no_formal_ino; u64 no_addr; }; struct gfs2_sb_host { u32 sb_magic; u32 sb_type; u32 sb_fs_format; u32 sb_multihost_format; u32 sb_bsize; u32 sb_bsize_shift; struct gfs2_inum_host sb_master_dir; struct gfs2_inum_host sb_root_dir; char sb_lockproto[GFS2_LOCKNAME_LEN]; char sb_locktable[GFS2_LOCKNAME_LEN]; }; /* * lm_mount() return values * * ls_jid - the journal ID this node should use * ls_first - this node is the first to mount the file system * ls_lockspace - lock module's context for this file system * ls_ops - lock module's functions */ struct lm_lockstruct { int ls_jid; unsigned int ls_first; const struct lm_lockops *ls_ops; dlm_lockspace_t *ls_dlm; int ls_recover_jid_done; /* These two are deprecated, */ int ls_recover_jid_status; /* used previously by gfs_controld */ struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ struct dlm_lksb ls_control_lksb; /* control_lock */ char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ char *ls_lvb_bits; spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ uint32_t ls_recover_mount; /* gen in first recover_done cb */ uint32_t ls_recover_start; /* gen in last recover_done cb */ uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ uint32_t ls_recover_size; /* size of recover_submit, recover_result */ uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ uint32_t *ls_recover_result; /* result of last jid recovery */ }; struct gfs2_pcpu_lkstats { /* One struct for each glock type */ struct gfs2_lkstats lkstats[10]; }; /* List of local (per node) statfs inodes */ struct local_statfs_inode { struct list_head si_list; struct inode *si_sc_inode; unsigned int si_jid; /* journal id this statfs inode corresponds to */ }; struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; struct kobject sd_kobj; struct completion sd_kobj_unregister; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; /* Constants computed on mount */ u32 sd_fsb2bb; u32 sd_fsb2bb_shift; u32 sd_diptrs; /* Number of pointers in a dinode */ u32 sd_inptrs; /* Number of pointers in a indirect block */ u32 sd_ldptrs; /* Number of pointers in a log descriptor block */ u32 sd_jbsize; /* Size of a journaled data block */ u32 sd_hash_bsize; /* sizeof(exhash block) */ u32 sd_hash_bsize_shift; u32 sd_hash_ptrs; /* Number of pointers in a hash block */ u32 sd_qc_per_block; u32 sd_blocks_per_bitmap; u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ u32 sd_max_height; /* Max height of a file's metadata tree */ u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */ struct gfs2_args sd_args; /* Mount arguments */ struct gfs2_tune sd_tune; /* Filesystem tuning structure */ /* Lock Stuff */ struct lm_lockstruct sd_lockstruct; struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_freeze_gl; struct work_struct sd_freeze_work; wait_queue_head_t sd_kill_wait; wait_queue_head_t sd_async_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; struct completion sd_wdack; struct delayed_work sd_control_work; /* Inode Stuff */ struct dentry *sd_master_dir; struct dentry *sd_root_dir; struct inode *sd_jindex; struct inode *sd_statfs_inode; struct inode *sd_sc_inode; struct list_head sd_sc_inodes_list; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; /* StatFS stuff */ spinlock_t sd_statfs_spin; struct gfs2_statfs_change_host sd_statfs_master; struct gfs2_statfs_change_host sd_statfs_local; int sd_statfs_force_sync; /* Resource group stuff */ int sd_rindex_uptodate; spinlock_t sd_rindex_spin; struct rb_root sd_rindex_tree; unsigned int sd_rgrps; unsigned int sd_max_rg_data; /* Journal index stuff */ struct list_head sd_jindex_list; spinlock_t sd_jindex_spin; struct mutex sd_jindex_mutex; unsigned int sd_journals; struct gfs2_jdesc *sd_jdesc; struct gfs2_holder sd_journal_gh; struct gfs2_holder sd_jinode_gh; struct gfs2_glock *sd_jinode_gl; struct gfs2_holder sd_sc_gh; struct buffer_head *sd_sc_bh; struct gfs2_holder sd_qc_gh; struct completion sd_journal_ready; /* Workqueue stuff */ struct workqueue_struct *sd_glock_wq; struct workqueue_struct *sd_delete_wq; /* Daemon stuff */ struct task_struct *sd_logd_process; struct task_struct *sd_quotad_process; /* Quota stuff */ struct list_head sd_quota_list; atomic_t sd_quota_count; struct mutex sd_quota_sync_mutex; wait_queue_head_t sd_quota_wait; unsigned int sd_quota_slots; unsigned long *sd_quota_bitmap; spinlock_t sd_bitmap_lock; u64 sd_quota_sync_gen; /* Log stuff */ struct inode *sd_inode; spinlock_t sd_log_lock; struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; struct list_head sd_log_revokes; struct list_head sd_log_ordered; spinlock_t sd_ordered_lock; atomic_t sd_log_thresh1; atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; atomic_t sd_log_blks_needed; atomic_t sd_log_revokes_available; wait_queue_head_t sd_log_waitq; wait_queue_head_t sd_logd_waitq; u64 sd_log_sequence; int sd_log_idle; struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; wait_queue_head_t sd_log_flush_wait; int sd_log_error; /* First log error */ wait_queue_head_t sd_withdraw_wait; unsigned int sd_log_tail; unsigned int sd_log_flush_tail; unsigned int sd_log_head; unsigned int sd_log_flush_head; spinlock_t sd_ail_lock; struct list_head sd_ail1_list; struct list_head sd_ail2_list; /* For quiescing the filesystem */ struct gfs2_holder sd_freeze_gh; struct mutex sd_freeze_mutex; struct list_head sd_dead_glocks; char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; /* Debugging crud */ unsigned long sd_last_warning; struct dentry *debugfs_dir; /* debugfs directory */ unsigned long sd_glock_dqs_held; }; #define GFS2_BAD_INO 1 static inline struct address_space *gfs2_aspace(struct gfs2_sbd *sdp) { return sdp->sd_inode->i_mapping; } static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) { gl->gl_stats.stats[which]++; } static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) { const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; preempt_disable(); this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++; preempt_enable(); } struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip) { return GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); } #endif /* __INCORE_DOT_H__ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_POSTED_INTR_H #define _X86_POSTED_INTR_H #include <asm/cmpxchg.h> #include <asm/rwonce.h> #include <asm/irq_vectors.h> #include <linux/bitmap.h> #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 #define PID_TABLE_ENTRY_VALID 1 #define NR_PIR_VECTORS 256 #define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) /* Posted-Interrupt Descriptor */ struct pi_desc { unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ union { struct { u16 notifications; /* Suppress and outstanding bits */ u8 nv; u8 rsvd_2; u32 ndst; }; u64 control; }; u32 rsvd[6]; } __aligned(64); /* * De-multiplexing posted interrupts is on the performance path, the code * below is written to optimize the cache performance based on the following * considerations: * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently * accessed by both CPU and IOMMU. * 2.During software processing of posted interrupts, the CPU needs to do * natural width read and xchg for checking and clearing posted interrupt * request (PIR), a 256 bit field within the PID. * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache * line when posting interrupts and setting control bits. * 4.The CPU can access the cache line a magnitude faster than the IOMMU. * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID * cache line. The cache line states after each operation are as follows, * assuming a 64-bit kernel: * CPU IOMMU PID Cache line state * --------------------------------------------------------------- *...read64 exclusive *...lock xchg64 modified *... post/atomic swap invalid *...------------------------------------------------------------- * * To reduce L1 data cache miss, it is important to avoid contention with * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used * when processing posted interrupts in software, e.g. to dispatch interrupt * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR * in KVM. * * In addition, the code is trying to keep the cache line state consistent * as much as possible. e.g. when making a copy and clearing the PIR * (assuming non-zero PIR bits are present in the entire PIR), it does: * read, read, read, read, xchg, xchg, xchg, xchg * instead of: * read, xchg, read, xchg, read, xchg, read, xchg */ static __always_inline bool pi_harvest_pir(unsigned long *pir, unsigned long *pir_vals) { unsigned long pending = 0; int i; for (i = 0; i < NR_PIR_WORDS; i++) { pir_vals[i] = READ_ONCE(pir[i]); pending |= pir_vals[i]; } if (!pending) return false; for (i = 0; i < NR_PIR_WORDS; i++) { if (!pir_vals[i]) continue; pir_vals[i] = arch_xchg(&pir[i], 0); } return true; } static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) { return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) { return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); } static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, pi_desc->pir); } static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) { return bitmap_empty(pi_desc->pir, NR_VECTORS); } static inline void pi_set_sn(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); } static inline void pi_set_on(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } static inline void pi_clear_on(struct pi_desc *pi_desc) { clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } static inline void pi_clear_sn(struct pi_desc *pi_desc) { clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); } static inline bool pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } static inline bool pi_test_sn(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); } static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc) { return test_bit(vector, (unsigned long *)pi_desc->pir); } /* Non-atomic helpers */ static inline void __pi_set_sn(struct pi_desc *pi_desc) { pi_desc->notifications |= BIT(POSTED_INTR_SN); } static inline void __pi_clear_sn(struct pi_desc *pi_desc) { pi_desc->notifications &= ~BIT(POSTED_INTR_SN); } #ifdef CONFIG_X86_POSTED_MSI /* * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's * own interrupts. Here we do not distinguish them since those vector bits in * PIR will always be zero. */ static inline bool pi_pending_this_cpu(unsigned int vector) { struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc); if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) return false; return test_bit(vector, pid->pir); } extern void intel_posted_msi_init(void); #else static inline bool pi_pending_this_cpu(unsigned int vector) { return false; } static inline void intel_posted_msi_init(void) {}; #endif /* X86_POSTED_MSI */ #endif /* _X86_POSTED_INTR_H */
6 1 1 1 1 1 1 1 1 1 5 3 2 3 2 6 2 5 5 2 1 1 10 8 2 5 5 2 5 5 5 5 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 // SPDX-License-Identifier: GPL-2.0 /* * cdc-wdm.c * * This driver supports USB CDC WCM Device Management. * * Copyright (c) 2007-2009 Oliver Neukum * * Some code taken from cdc-acm.c * * Released under the GPLv2. * * Many thanks to Carl Nordbeck */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/ioctl.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/poll.h> #include <linux/skbuff.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/wwan.h> #include <asm/byteorder.h> #include <linux/unaligned.h> #include <linux/usb/cdc-wdm.h> #define DRIVER_AUTHOR "Oliver Neukum" #define DRIVER_DESC "USB Abstract Control Model driver for USB WCM Device Management" static const struct usb_device_id wdm_ids[] = { { .match_flags = USB_DEVICE_ID_MATCH_INT_CLASS | USB_DEVICE_ID_MATCH_INT_SUBCLASS, .bInterfaceClass = USB_CLASS_COMM, .bInterfaceSubClass = USB_CDC_SUBCLASS_DMM }, { } }; MODULE_DEVICE_TABLE (usb, wdm_ids); #define WDM_MINOR_BASE 176 #define WDM_IN_USE 1 #define WDM_DISCONNECTING 2 #define WDM_RESULT 3 #define WDM_READ 4 #define WDM_INT_STALL 5 #define WDM_POLL_RUNNING 6 #define WDM_RESPONDING 7 #define WDM_SUSPENDING 8 #define WDM_RESETTING 9 #define WDM_OVERFLOW 10 #define WDM_WWAN_IN_USE 11 #define WDM_MAX 16 /* we cannot wait forever at flush() */ #define WDM_FLUSH_TIMEOUT (30 * HZ) /* CDC-WMC r1.1 requires wMaxCommand to be "at least 256 decimal (0x100)" */ #define WDM_DEFAULT_BUFSIZE 256 static DEFINE_MUTEX(wdm_mutex); static DEFINE_SPINLOCK(wdm_device_list_lock); static LIST_HEAD(wdm_device_list); /* --- method tables --- */ struct wdm_device { u8 *inbuf; /* buffer for response */ u8 *outbuf; /* buffer for command */ u8 *sbuf; /* buffer for status */ u8 *ubuf; /* buffer for copy to user space */ struct urb *command; struct urb *response; struct urb *validity; struct usb_interface *intf; struct usb_ctrlrequest *orq; struct usb_ctrlrequest *irq; spinlock_t iuspin; unsigned long flags; u16 bufsize; u16 wMaxCommand; u16 wMaxPacketSize; __le16 inum; int length; int read; int count; dma_addr_t shandle; dma_addr_t ihandle; struct mutex wlock; struct mutex rlock; wait_queue_head_t wait; struct work_struct rxwork; struct work_struct service_outs_intr; int werr; int rerr; int resp_count; struct list_head device_list; int (*manage_power)(struct usb_interface *, int); enum wwan_port_type wwanp_type; struct wwan_port *wwanp; }; static struct usb_driver wdm_driver; /* return intfdata if we own the interface, else look up intf in the list */ static struct wdm_device *wdm_find_device(struct usb_interface *intf) { struct wdm_device *desc; spin_lock(&wdm_device_list_lock); list_for_each_entry(desc, &wdm_device_list, device_list) if (desc->intf == intf) goto found; desc = NULL; found: spin_unlock(&wdm_device_list_lock); return desc; } static struct wdm_device *wdm_find_device_by_minor(int minor) { struct wdm_device *desc; spin_lock(&wdm_device_list_lock); list_for_each_entry(desc, &wdm_device_list, device_list) if (desc->intf->minor == minor) goto found; desc = NULL; found: spin_unlock(&wdm_device_list_lock); return desc; } /* --- callbacks --- */ static void wdm_out_callback(struct urb *urb) { struct wdm_device *desc; unsigned long flags; desc = urb->context; spin_lock_irqsave(&desc->iuspin, flags); desc->werr = urb->status; spin_unlock_irqrestore(&desc->iuspin, flags); kfree(desc->outbuf); desc->outbuf = NULL; clear_bit(WDM_IN_USE, &desc->flags); wake_up_all(&desc->wait); } static void wdm_wwan_rx(struct wdm_device *desc, int length); static void wdm_in_callback(struct urb *urb) { unsigned long flags; struct wdm_device *desc = urb->context; int status = urb->status; int length = urb->actual_length; spin_lock_irqsave(&desc->iuspin, flags); clear_bit(WDM_RESPONDING, &desc->flags); if (status) { switch (status) { case -ENOENT: dev_dbg(&desc->intf->dev, "nonzero urb status received: -ENOENT\n"); goto skip_error; case -ECONNRESET: dev_dbg(&desc->intf->dev, "nonzero urb status received: -ECONNRESET\n"); goto skip_error; case -ESHUTDOWN: dev_dbg(&desc->intf->dev, "nonzero urb status received: -ESHUTDOWN\n"); goto skip_error; case -EPIPE: dev_err(&desc->intf->dev, "nonzero urb status received: -EPIPE\n"); break; default: dev_err(&desc->intf->dev, "Unexpected error %d\n", status); break; } } if (test_bit(WDM_WWAN_IN_USE, &desc->flags)) { wdm_wwan_rx(desc, length); goto out; } /* * only set a new error if there is no previous error. * Errors are only cleared during read/open * Avoid propagating -EPIPE (stall) to userspace since it is * better handled as an empty read */ if (desc->rerr == 0 && status != -EPIPE) desc->rerr = status; if (length == 0) { dev_dbg(&desc->intf->dev, "received ZLP\n"); goto skip_zlp; } if (length + desc->length > desc->wMaxCommand) { /* The buffer would overflow */ set_bit(WDM_OVERFLOW, &desc->flags); } else { /* we may already be in overflow */ if (!test_bit(WDM_OVERFLOW, &desc->flags)) { memmove(desc->ubuf + desc->length, desc->inbuf, length); desc->length += length; } } skip_error: if (desc->rerr) { /* * If there was a ZLP or an error, userspace may decide to not * read any data after poll'ing. * We should respond to further attempts from the device to send * data, so that we can get unstuck. */ skip_zlp: schedule_work(&desc->service_outs_intr); } else { set_bit(WDM_READ, &desc->flags); wake_up(&desc->wait); } out: spin_unlock_irqrestore(&desc->iuspin, flags); } static void wdm_int_callback(struct urb *urb) { unsigned long flags; int rv = 0; int responding; int status = urb->status; struct wdm_device *desc; struct usb_cdc_notification *dr; desc = urb->context; dr = (struct usb_cdc_notification *)desc->sbuf; if (status) { switch (status) { case -ESHUTDOWN: case -ENOENT: case -ECONNRESET: return; /* unplug */ case -EPIPE: set_bit(WDM_INT_STALL, &desc->flags); dev_err(&desc->intf->dev, "Stall on int endpoint\n"); goto sw; /* halt is cleared in work */ default: dev_err_ratelimited(&desc->intf->dev, "nonzero urb status received: %d\n", status); break; } } if (urb->actual_length < sizeof(struct usb_cdc_notification)) { dev_err_ratelimited(&desc->intf->dev, "wdm_int_callback - %d bytes\n", urb->actual_length); goto exit; } switch (dr->bNotificationType) { case USB_CDC_NOTIFY_RESPONSE_AVAILABLE: dev_dbg(&desc->intf->dev, "NOTIFY_RESPONSE_AVAILABLE received: index %d len %d\n", le16_to_cpu(dr->wIndex), le16_to_cpu(dr->wLength)); break; case USB_CDC_NOTIFY_NETWORK_CONNECTION: dev_dbg(&desc->intf->dev, "NOTIFY_NETWORK_CONNECTION %s network\n", dr->wValue ? "connected to" : "disconnected from"); goto exit; case USB_CDC_NOTIFY_SPEED_CHANGE: dev_dbg(&desc->intf->dev, "SPEED_CHANGE received (len %u)\n", urb->actual_length); goto exit; default: clear_bit(WDM_POLL_RUNNING, &desc->flags); dev_err(&desc->intf->dev, "unknown notification %d received: index %d len %d\n", dr->bNotificationType, le16_to_cpu(dr->wIndex), le16_to_cpu(dr->wLength)); goto exit; } spin_lock_irqsave(&desc->iuspin, flags); responding = test_and_set_bit(WDM_RESPONDING, &desc->flags); if (!desc->resp_count++ && !responding && !test_bit(WDM_DISCONNECTING, &desc->flags) && !test_bit(WDM_SUSPENDING, &desc->flags)) { rv = usb_submit_urb(desc->response, GFP_ATOMIC); dev_dbg(&desc->intf->dev, "submit response URB %d\n", rv); } spin_unlock_irqrestore(&desc->iuspin, flags); if (rv < 0) { clear_bit(WDM_RESPONDING, &desc->flags); if (rv == -EPERM) return; if (rv == -ENOMEM) { sw: rv = schedule_work(&desc->rxwork); if (rv) dev_err(&desc->intf->dev, "Cannot schedule work\n"); } } exit: rv = usb_submit_urb(urb, GFP_ATOMIC); if (rv) dev_err(&desc->intf->dev, "%s - usb_submit_urb failed with result %d\n", __func__, rv); } static void poison_urbs(struct wdm_device *desc) { /* the order here is essential */ usb_poison_urb(desc->command); usb_poison_urb(desc->validity); usb_poison_urb(desc->response); } static void unpoison_urbs(struct wdm_device *desc) { /* * the order here is not essential * it is symmetrical just to be nice */ usb_unpoison_urb(desc->response); usb_unpoison_urb(desc->validity); usb_unpoison_urb(desc->command); } static void free_urbs(struct wdm_device *desc) { usb_free_urb(desc->validity); usb_free_urb(desc->response); usb_free_urb(desc->command); } static void cleanup(struct wdm_device *desc) { kfree(desc->sbuf); kfree(desc->inbuf); kfree(desc->orq); kfree(desc->irq); kfree(desc->ubuf); free_urbs(desc); kfree(desc); } static ssize_t wdm_write (struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { u8 *buf; int rv = -EMSGSIZE, r, we; struct wdm_device *desc = file->private_data; struct usb_ctrlrequest *req; if (count > desc->wMaxCommand) count = desc->wMaxCommand; spin_lock_irq(&desc->iuspin); we = desc->werr; desc->werr = 0; spin_unlock_irq(&desc->iuspin); if (we < 0) return usb_translate_errors(we); buf = memdup_user(buffer, count); if (IS_ERR(buf)) return PTR_ERR(buf); /* concurrent writes and disconnect */ r = mutex_lock_interruptible(&desc->wlock); rv = -ERESTARTSYS; if (r) goto out_free_mem; if (test_bit(WDM_DISCONNECTING, &desc->flags)) { rv = -ENODEV; goto out_free_mem_lock; } r = usb_autopm_get_interface(desc->intf); if (r < 0) { rv = usb_translate_errors(r); goto out_free_mem_lock; } if (!(file->f_flags & O_NONBLOCK)) r = wait_event_interruptible(desc->wait, !test_bit(WDM_IN_USE, &desc->flags)); else if (test_bit(WDM_IN_USE, &desc->flags)) r = -EAGAIN; if (test_bit(WDM_RESETTING, &desc->flags)) r = -EIO; if (test_bit(WDM_DISCONNECTING, &desc->flags)) r = -ENODEV; if (r < 0) { rv = r; goto out_free_mem_pm; } req = desc->orq; usb_fill_control_urb( desc->command, interface_to_usbdev(desc->intf), /* using common endpoint 0 */ usb_sndctrlpipe(interface_to_usbdev(desc->intf), 0), (unsigned char *)req, buf, count, wdm_out_callback, desc ); req->bRequestType = (USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE); req->bRequest = USB_CDC_SEND_ENCAPSULATED_COMMAND; req->wValue = 0; req->wIndex = desc->inum; /* already converted */ req->wLength = cpu_to_le16(count); set_bit(WDM_IN_USE, &desc->flags); desc->outbuf = buf; rv = usb_submit_urb(desc->command, GFP_KERNEL); if (rv < 0) { desc->outbuf = NULL; clear_bit(WDM_IN_USE, &desc->flags); wake_up_all(&desc->wait); /* for wdm_wait_for_response() */ dev_err(&desc->intf->dev, "Tx URB error: %d\n", rv); rv = usb_translate_errors(rv); goto out_free_mem_pm; } else { dev_dbg(&desc->intf->dev, "Tx URB has been submitted index=%d\n", le16_to_cpu(req->wIndex)); } usb_autopm_put_interface(desc->intf); mutex_unlock(&desc->wlock); return count; out_free_mem_pm: usb_autopm_put_interface(desc->intf); out_free_mem_lock: mutex_unlock(&desc->wlock); out_free_mem: kfree(buf); return rv; } /* * Submit the read urb if resp_count is non-zero. * * Called with desc->iuspin locked */ static int service_outstanding_interrupt(struct wdm_device *desc) { int rv = 0; /* submit read urb only if the device is waiting for it */ if (!desc->resp_count || !--desc->resp_count) goto out; if (test_bit(WDM_DISCONNECTING, &desc->flags)) { rv = -ENODEV; goto out; } if (test_bit(WDM_RESETTING, &desc->flags)) { rv = -EIO; goto out; } set_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irq(&desc->iuspin); rv = usb_submit_urb(desc->response, GFP_KERNEL); spin_lock_irq(&desc->iuspin); if (rv) { if (!test_bit(WDM_DISCONNECTING, &desc->flags)) dev_err(&desc->intf->dev, "usb_submit_urb failed with result %d\n", rv); /* make sure the next notification trigger a submit */ clear_bit(WDM_RESPONDING, &desc->flags); desc->resp_count = 0; } out: return rv; } static ssize_t wdm_read (struct file *file, char __user *buffer, size_t count, loff_t *ppos) { int rv, cntr; int i = 0; struct wdm_device *desc = file->private_data; rv = mutex_lock_interruptible(&desc->rlock); /*concurrent reads */ if (rv < 0) return -ERESTARTSYS; cntr = READ_ONCE(desc->length); if (cntr == 0) { desc->read = 0; retry: if (test_bit(WDM_DISCONNECTING, &desc->flags)) { rv = -ENODEV; goto err; } if (test_bit(WDM_OVERFLOW, &desc->flags)) { clear_bit(WDM_OVERFLOW, &desc->flags); rv = -ENOBUFS; goto err; } i++; if (file->f_flags & O_NONBLOCK) { if (!test_bit(WDM_READ, &desc->flags)) { rv = -EAGAIN; goto err; } rv = 0; } else { rv = wait_event_interruptible(desc->wait, test_bit(WDM_READ, &desc->flags)); } /* may have happened while we slept */ if (test_bit(WDM_DISCONNECTING, &desc->flags)) { rv = -ENODEV; goto err; } if (test_bit(WDM_RESETTING, &desc->flags)) { rv = -EIO; goto err; } usb_mark_last_busy(interface_to_usbdev(desc->intf)); if (rv < 0) { rv = -ERESTARTSYS; goto err; } spin_lock_irq(&desc->iuspin); if (desc->rerr) { /* read completed, error happened */ rv = usb_translate_errors(desc->rerr); desc->rerr = 0; spin_unlock_irq(&desc->iuspin); goto err; } /* * recheck whether we've lost the race * against the completion handler */ if (!test_bit(WDM_READ, &desc->flags)) { /* lost race */ spin_unlock_irq(&desc->iuspin); goto retry; } cntr = desc->length; spin_unlock_irq(&desc->iuspin); } if (cntr > count) cntr = count; rv = copy_to_user(buffer, desc->ubuf, cntr); if (rv > 0) { rv = -EFAULT; goto err; } spin_lock_irq(&desc->iuspin); for (i = 0; i < desc->length - cntr; i++) desc->ubuf[i] = desc->ubuf[i + cntr]; desc->length -= cntr; /* in case we had outstanding data */ if (!desc->length) { clear_bit(WDM_READ, &desc->flags); service_outstanding_interrupt(desc); } spin_unlock_irq(&desc->iuspin); rv = cntr; err: mutex_unlock(&desc->rlock); return rv; } static int wdm_wait_for_response(struct file *file, long timeout) { struct wdm_device *desc = file->private_data; long rv; /* Use long here because (int) MAX_SCHEDULE_TIMEOUT < 0. */ /* * Needs both flags. We cannot do with one because resetting it would * cause a race with write() yet we need to signal a disconnect. */ rv = wait_event_interruptible_timeout(desc->wait, !test_bit(WDM_IN_USE, &desc->flags) || test_bit(WDM_DISCONNECTING, &desc->flags), timeout); /* * To report the correct error. This is best effort. * We are inevitably racing with the hardware. */ if (test_bit(WDM_DISCONNECTING, &desc->flags)) return -ENODEV; if (!rv) return -EIO; if (rv < 0) return -EINTR; spin_lock_irq(&desc->iuspin); rv = desc->werr; desc->werr = 0; spin_unlock_irq(&desc->iuspin); return usb_translate_errors(rv); } /* * You need to send a signal when you react to malicious or defective hardware. * Also, don't abort when fsync() returned -EINVAL, for older kernels which do * not implement wdm_flush() will return -EINVAL. */ static int wdm_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return wdm_wait_for_response(file, MAX_SCHEDULE_TIMEOUT); } /* * Same with wdm_fsync(), except it uses finite timeout in order to react to * malicious or defective hardware which ceased communication after close() was * implicitly called due to process termination. */ static int wdm_flush(struct file *file, fl_owner_t id) { return wdm_wait_for_response(file, WDM_FLUSH_TIMEOUT); } static __poll_t wdm_poll(struct file *file, struct poll_table_struct *wait) { struct wdm_device *desc = file->private_data; unsigned long flags; __poll_t mask = 0; spin_lock_irqsave(&desc->iuspin, flags); if (test_bit(WDM_DISCONNECTING, &desc->flags)) { mask = EPOLLHUP | EPOLLERR; spin_unlock_irqrestore(&desc->iuspin, flags); goto desc_out; } if (test_bit(WDM_READ, &desc->flags)) mask = EPOLLIN | EPOLLRDNORM; if (desc->rerr || desc->werr) mask |= EPOLLERR; if (!test_bit(WDM_IN_USE, &desc->flags)) mask |= EPOLLOUT | EPOLLWRNORM; spin_unlock_irqrestore(&desc->iuspin, flags); poll_wait(file, &desc->wait, wait); desc_out: return mask; } static int wdm_open(struct inode *inode, struct file *file) { int minor = iminor(inode); int rv = -ENODEV; struct usb_interface *intf; struct wdm_device *desc; mutex_lock(&wdm_mutex); desc = wdm_find_device_by_minor(minor); if (!desc) goto out; intf = desc->intf; if (test_bit(WDM_DISCONNECTING, &desc->flags)) goto out; file->private_data = desc; if (test_bit(WDM_WWAN_IN_USE, &desc->flags)) { rv = -EBUSY; goto out; } smp_rmb(); /* ordered against wdm_wwan_port_stop() */ rv = usb_autopm_get_interface(desc->intf); if (rv < 0) { dev_err(&desc->intf->dev, "Error autopm - %d\n", rv); goto out; } /* using write lock to protect desc->count */ mutex_lock(&desc->wlock); if (!desc->count++) { desc->werr = 0; desc->rerr = 0; rv = usb_submit_urb(desc->validity, GFP_KERNEL); if (rv < 0) { desc->count--; dev_err(&desc->intf->dev, "Error submitting int urb - %d\n", rv); rv = usb_translate_errors(rv); } } else { rv = 0; } mutex_unlock(&desc->wlock); if (desc->count == 1) desc->manage_power(intf, 1); usb_autopm_put_interface(desc->intf); out: mutex_unlock(&wdm_mutex); return rv; } static int wdm_release(struct inode *inode, struct file *file) { struct wdm_device *desc = file->private_data; mutex_lock(&wdm_mutex); /* using write lock to protect desc->count */ mutex_lock(&desc->wlock); desc->count--; mutex_unlock(&desc->wlock); if (!desc->count) { if (!test_bit(WDM_DISCONNECTING, &desc->flags)) { dev_dbg(&desc->intf->dev, "wdm_release: cleanup\n"); poison_urbs(desc); spin_lock_irq(&desc->iuspin); desc->resp_count = 0; clear_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irq(&desc->iuspin); desc->manage_power(desc->intf, 0); unpoison_urbs(desc); } else { /* must avoid dev_printk here as desc->intf is invalid */ pr_debug(KBUILD_MODNAME " %s: device gone - cleaning up\n", __func__); cleanup(desc); } } mutex_unlock(&wdm_mutex); return 0; } static long wdm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct wdm_device *desc = file->private_data; int rv = 0; switch (cmd) { case IOCTL_WDM_MAX_COMMAND: if (copy_to_user((void __user *)arg, &desc->wMaxCommand, sizeof(desc->wMaxCommand))) rv = -EFAULT; break; default: rv = -ENOTTY; } return rv; } static const struct file_operations wdm_fops = { .owner = THIS_MODULE, .read = wdm_read, .write = wdm_write, .fsync = wdm_fsync, .open = wdm_open, .flush = wdm_flush, .release = wdm_release, .poll = wdm_poll, .unlocked_ioctl = wdm_ioctl, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; static struct usb_class_driver wdm_class = { .name = "cdc-wdm%d", .fops = &wdm_fops, .minor_base = WDM_MINOR_BASE, }; /* --- WWAN framework integration --- */ #ifdef CONFIG_WWAN static int wdm_wwan_port_start(struct wwan_port *port) { struct wdm_device *desc = wwan_port_get_drvdata(port); int rv; /* The interface is both exposed via the WWAN framework and as a * legacy usbmisc chardev. If chardev is already open, just fail * to prevent concurrent usage. Otherwise, switch to WWAN mode. */ mutex_lock(&wdm_mutex); if (desc->count) { mutex_unlock(&wdm_mutex); return -EBUSY; } set_bit(WDM_WWAN_IN_USE, &desc->flags); mutex_unlock(&wdm_mutex); desc->manage_power(desc->intf, 1); /* tx is allowed */ wwan_port_txon(port); /* Start getting events */ rv = usb_submit_urb(desc->validity, GFP_KERNEL); if (rv < 0) { wwan_port_txoff(port); desc->manage_power(desc->intf, 0); /* this must be last lest we race with chardev open */ clear_bit(WDM_WWAN_IN_USE, &desc->flags); } return rv; } static void wdm_wwan_port_stop(struct wwan_port *port) { struct wdm_device *desc = wwan_port_get_drvdata(port); /* Stop all transfers and disable WWAN mode */ poison_urbs(desc); desc->manage_power(desc->intf, 0); clear_bit(WDM_READ, &desc->flags); unpoison_urbs(desc); smp_wmb(); /* ordered against wdm_open() */ /* this must be last lest we open a poisoned device */ clear_bit(WDM_WWAN_IN_USE, &desc->flags); } static void wdm_wwan_port_tx_complete(struct urb *urb) { struct sk_buff *skb = urb->context; struct wdm_device *desc = skb_shinfo(skb)->destructor_arg; usb_autopm_put_interface_async(desc->intf); wwan_port_txon(desc->wwanp); kfree_skb(skb); } static int wdm_wwan_port_tx(struct wwan_port *port, struct sk_buff *skb) { struct wdm_device *desc = wwan_port_get_drvdata(port); struct usb_interface *intf = desc->intf; struct usb_ctrlrequest *req = desc->orq; int rv; rv = usb_autopm_get_interface(intf); if (rv) return rv; usb_fill_control_urb( desc->command, interface_to_usbdev(intf), usb_sndctrlpipe(interface_to_usbdev(intf), 0), (unsigned char *)req, skb->data, skb->len, wdm_wwan_port_tx_complete, skb ); req->bRequestType = (USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE); req->bRequest = USB_CDC_SEND_ENCAPSULATED_COMMAND; req->wValue = 0; req->wIndex = desc->inum; /* already converted */ req->wLength = cpu_to_le16(skb->len); skb_shinfo(skb)->destructor_arg = desc; rv = usb_submit_urb(desc->command, GFP_KERNEL); if (rv) usb_autopm_put_interface(intf); else /* One transfer at a time, stop TX until URB completion */ wwan_port_txoff(port); return rv; } static const struct wwan_port_ops wdm_wwan_port_ops = { .start = wdm_wwan_port_start, .stop = wdm_wwan_port_stop, .tx = wdm_wwan_port_tx, }; static void wdm_wwan_init(struct wdm_device *desc) { struct usb_interface *intf = desc->intf; struct wwan_port *port; /* Only register to WWAN core if protocol/type is known */ if (desc->wwanp_type == WWAN_PORT_UNKNOWN) { dev_info(&intf->dev, "Unknown control protocol\n"); return; } port = wwan_create_port(&intf->dev, desc->wwanp_type, &wdm_wwan_port_ops, NULL, desc); if (IS_ERR(port)) { dev_err(&intf->dev, "%s: Unable to create WWAN port\n", dev_name(intf->usb_dev)); return; } desc->wwanp = port; } static void wdm_wwan_deinit(struct wdm_device *desc) { if (!desc->wwanp) return; wwan_remove_port(desc->wwanp); desc->wwanp = NULL; } static void wdm_wwan_rx(struct wdm_device *desc, int length) { struct wwan_port *port = desc->wwanp; struct sk_buff *skb; /* Forward data to WWAN port */ skb = alloc_skb(length, GFP_ATOMIC); if (!skb) return; skb_put_data(skb, desc->inbuf, length); wwan_port_rx(port, skb); /* inbuf has been copied, it is safe to check for outstanding data */ schedule_work(&desc->service_outs_intr); } #else /* CONFIG_WWAN */ static void wdm_wwan_init(struct wdm_device *desc) {} static void wdm_wwan_deinit(struct wdm_device *desc) {} static void wdm_wwan_rx(struct wdm_device *desc, int length) {} #endif /* CONFIG_WWAN */ /* --- error handling --- */ static void wdm_rxwork(struct work_struct *work) { struct wdm_device *desc = container_of(work, struct wdm_device, rxwork); unsigned long flags; int rv = 0; int responding; spin_lock_irqsave(&desc->iuspin, flags); if (test_bit(WDM_DISCONNECTING, &desc->flags)) { spin_unlock_irqrestore(&desc->iuspin, flags); } else { responding = test_and_set_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irqrestore(&desc->iuspin, flags); if (!responding) rv = usb_submit_urb(desc->response, GFP_KERNEL); if (rv < 0 && rv != -EPERM) { spin_lock_irqsave(&desc->iuspin, flags); clear_bit(WDM_RESPONDING, &desc->flags); if (!test_bit(WDM_DISCONNECTING, &desc->flags)) schedule_work(&desc->rxwork); spin_unlock_irqrestore(&desc->iuspin, flags); } } } static void service_interrupt_work(struct work_struct *work) { struct wdm_device *desc; desc = container_of(work, struct wdm_device, service_outs_intr); spin_lock_irq(&desc->iuspin); service_outstanding_interrupt(desc); if (!desc->resp_count && (desc->length || desc->rerr)) { set_bit(WDM_READ, &desc->flags); wake_up(&desc->wait); } spin_unlock_irq(&desc->iuspin); } /* --- hotplug --- */ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor *ep, u16 bufsize, enum wwan_port_type type, int (*manage_power)(struct usb_interface *, int)) { int rv = -ENOMEM; struct wdm_device *desc; desc = kzalloc(sizeof(struct wdm_device), GFP_KERNEL); if (!desc) goto out; INIT_LIST_HEAD(&desc->device_list); mutex_init(&desc->rlock); mutex_init(&desc->wlock); spin_lock_init(&desc->iuspin); init_waitqueue_head(&desc->wait); desc->wMaxCommand = bufsize; /* this will be expanded and needed in hardware endianness */ desc->inum = cpu_to_le16((u16)intf->cur_altsetting->desc.bInterfaceNumber); desc->intf = intf; desc->wwanp_type = type; INIT_WORK(&desc->rxwork, wdm_rxwork); INIT_WORK(&desc->service_outs_intr, service_interrupt_work); if (!usb_endpoint_is_int_in(ep)) { rv = -EINVAL; goto err; } desc->wMaxPacketSize = usb_endpoint_maxp(ep); desc->orq = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL); if (!desc->orq) goto err; desc->irq = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL); if (!desc->irq) goto err; desc->validity = usb_alloc_urb(0, GFP_KERNEL); if (!desc->validity) goto err; desc->response = usb_alloc_urb(0, GFP_KERNEL); if (!desc->response) goto err; desc->command = usb_alloc_urb(0, GFP_KERNEL); if (!desc->command) goto err; desc->ubuf = kmalloc(desc->wMaxCommand, GFP_KERNEL); if (!desc->ubuf) goto err; desc->sbuf = kmalloc(desc->wMaxPacketSize, GFP_KERNEL); if (!desc->sbuf) goto err; desc->inbuf = kmalloc(desc->wMaxCommand, GFP_KERNEL); if (!desc->inbuf) goto err; usb_fill_int_urb( desc->validity, interface_to_usbdev(intf), usb_rcvintpipe(interface_to_usbdev(intf), ep->bEndpointAddress), desc->sbuf, desc->wMaxPacketSize, wdm_int_callback, desc, ep->bInterval ); desc->irq->bRequestType = (USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE); desc->irq->bRequest = USB_CDC_GET_ENCAPSULATED_RESPONSE; desc->irq->wValue = 0; desc->irq->wIndex = desc->inum; /* already converted */ desc->irq->wLength = cpu_to_le16(desc->wMaxCommand); usb_fill_control_urb( desc->response, interface_to_usbdev(intf), /* using common endpoint 0 */ usb_rcvctrlpipe(interface_to_usbdev(desc->intf), 0), (unsigned char *)desc->irq, desc->inbuf, desc->wMaxCommand, wdm_in_callback, desc ); desc->manage_power = manage_power; spin_lock(&wdm_device_list_lock); list_add(&desc->device_list, &wdm_device_list); spin_unlock(&wdm_device_list_lock); rv = usb_register_dev(intf, &wdm_class); if (rv < 0) goto err; else dev_info(&intf->dev, "%s: USB WDM device\n", dev_name(intf->usb_dev)); wdm_wwan_init(desc); out: return rv; err: spin_lock(&wdm_device_list_lock); list_del(&desc->device_list); spin_unlock(&wdm_device_list_lock); cleanup(desc); return rv; } static int wdm_manage_power(struct usb_interface *intf, int on) { /* need autopm_get/put here to ensure the usbcore sees the new value */ int rv = usb_autopm_get_interface(intf); intf->needs_remote_wakeup = on; if (!rv) usb_autopm_put_interface(intf); return 0; } static int wdm_probe(struct usb_interface *intf, const struct usb_device_id *id) { int rv = -EINVAL; struct usb_host_interface *iface; struct usb_endpoint_descriptor *ep; struct usb_cdc_parsed_header hdr; u8 *buffer = intf->altsetting->extra; int buflen = intf->altsetting->extralen; u16 maxcom = WDM_DEFAULT_BUFSIZE; if (!buffer) goto err; cdc_parse_cdc_header(&hdr, intf, buffer, buflen); if (hdr.usb_cdc_dmm_desc) maxcom = le16_to_cpu(hdr.usb_cdc_dmm_desc->wMaxCommand); iface = intf->cur_altsetting; if (iface->desc.bNumEndpoints != 1) goto err; ep = &iface->endpoint[0].desc; rv = wdm_create(intf, ep, maxcom, WWAN_PORT_UNKNOWN, &wdm_manage_power); err: return rv; } /** * usb_cdc_wdm_register - register a WDM subdriver * @intf: usb interface the subdriver will associate with * @ep: interrupt endpoint to monitor for notifications * @bufsize: maximum message size to support for read/write * @type: Type/protocol of the transported data (MBIM, QMI...) * @manage_power: call-back invoked during open and release to * manage the device's power * Create WDM usb class character device and associate it with intf * without binding, allowing another driver to manage the interface. * * The subdriver will manage the given interrupt endpoint exclusively * and will issue control requests referring to the given intf. It * will otherwise avoid interferring, and in particular not do * usb_set_intfdata/usb_get_intfdata on intf. * * The return value is a pointer to the subdriver's struct usb_driver. * The registering driver is responsible for calling this subdriver's * disconnect, suspend, resume, pre_reset and post_reset methods from * its own. */ struct usb_driver *usb_cdc_wdm_register(struct usb_interface *intf, struct usb_endpoint_descriptor *ep, int bufsize, enum wwan_port_type type, int (*manage_power)(struct usb_interface *, int)) { int rv; rv = wdm_create(intf, ep, bufsize, type, manage_power); if (rv < 0) goto err; return &wdm_driver; err: return ERR_PTR(rv); } EXPORT_SYMBOL(usb_cdc_wdm_register); static void wdm_disconnect(struct usb_interface *intf) { struct wdm_device *desc; unsigned long flags; usb_deregister_dev(intf, &wdm_class); desc = wdm_find_device(intf); mutex_lock(&wdm_mutex); wdm_wwan_deinit(desc); /* the spinlock makes sure no new urbs are generated in the callbacks */ spin_lock_irqsave(&desc->iuspin, flags); set_bit(WDM_DISCONNECTING, &desc->flags); set_bit(WDM_READ, &desc->flags); spin_unlock_irqrestore(&desc->iuspin, flags); wake_up_all(&desc->wait); mutex_lock(&desc->rlock); mutex_lock(&desc->wlock); poison_urbs(desc); cancel_work_sync(&desc->rxwork); cancel_work_sync(&desc->service_outs_intr); mutex_unlock(&desc->wlock); mutex_unlock(&desc->rlock); /* the desc->intf pointer used as list key is now invalid */ spin_lock(&wdm_device_list_lock); list_del(&desc->device_list); spin_unlock(&wdm_device_list_lock); if (!desc->count) cleanup(desc); else dev_dbg(&intf->dev, "%d open files - postponing cleanup\n", desc->count); mutex_unlock(&wdm_mutex); } #ifdef CONFIG_PM static int wdm_suspend(struct usb_interface *intf, pm_message_t message) { struct wdm_device *desc = wdm_find_device(intf); int rv = 0; dev_dbg(&desc->intf->dev, "wdm%d_suspend\n", intf->minor); /* if this is an autosuspend the caller does the locking */ if (!PMSG_IS_AUTO(message)) { mutex_lock(&desc->rlock); mutex_lock(&desc->wlock); } spin_lock_irq(&desc->iuspin); if (PMSG_IS_AUTO(message) && (test_bit(WDM_IN_USE, &desc->flags) || test_bit(WDM_RESPONDING, &desc->flags))) { spin_unlock_irq(&desc->iuspin); rv = -EBUSY; } else { set_bit(WDM_SUSPENDING, &desc->flags); spin_unlock_irq(&desc->iuspin); /* callback submits work - order is essential */ poison_urbs(desc); cancel_work_sync(&desc->rxwork); cancel_work_sync(&desc->service_outs_intr); unpoison_urbs(desc); } if (!PMSG_IS_AUTO(message)) { mutex_unlock(&desc->wlock); mutex_unlock(&desc->rlock); } return rv; } #endif static int recover_from_urb_loss(struct wdm_device *desc) { int rv = 0; if (desc->count) { rv = usb_submit_urb(desc->validity, GFP_NOIO); if (rv < 0) dev_err(&desc->intf->dev, "Error resume submitting int urb - %d\n", rv); } return rv; } #ifdef CONFIG_PM static int wdm_resume(struct usb_interface *intf) { struct wdm_device *desc = wdm_find_device(intf); int rv; dev_dbg(&desc->intf->dev, "wdm%d_resume\n", intf->minor); clear_bit(WDM_SUSPENDING, &desc->flags); rv = recover_from_urb_loss(desc); return rv; } #endif static int wdm_pre_reset(struct usb_interface *intf) { struct wdm_device *desc = wdm_find_device(intf); /* * we notify everybody using poll of * an exceptional situation * must be done before recovery lest a spontaneous * message from the device is lost */ spin_lock_irq(&desc->iuspin); set_bit(WDM_RESETTING, &desc->flags); /* inform read/write */ set_bit(WDM_READ, &desc->flags); /* unblock read */ clear_bit(WDM_IN_USE, &desc->flags); /* unblock write */ desc->rerr = -EINTR; spin_unlock_irq(&desc->iuspin); wake_up_all(&desc->wait); mutex_lock(&desc->rlock); mutex_lock(&desc->wlock); poison_urbs(desc); cancel_work_sync(&desc->rxwork); cancel_work_sync(&desc->service_outs_intr); return 0; } static int wdm_post_reset(struct usb_interface *intf) { struct wdm_device *desc = wdm_find_device(intf); int rv; unpoison_urbs(desc); clear_bit(WDM_OVERFLOW, &desc->flags); clear_bit(WDM_RESETTING, &desc->flags); rv = recover_from_urb_loss(desc); mutex_unlock(&desc->wlock); mutex_unlock(&desc->rlock); return rv; } static struct usb_driver wdm_driver = { .name = "cdc_wdm", .probe = wdm_probe, .disconnect = wdm_disconnect, #ifdef CONFIG_PM .suspend = wdm_suspend, .resume = wdm_resume, .reset_resume = wdm_resume, #endif .pre_reset = wdm_pre_reset, .post_reset = wdm_post_reset, .id_table = wdm_ids, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(wdm_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
441 437 439 48 49 34 29 428 368 357 417 429 4 368 418 187 158 435 437 308 30 56 10 1 426 9 328 155 1 260 96 9 2 3 90 91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bfind.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Search routines for btrees */ #include <linux/slab.h> #include "hfsplus_fs.h" int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) { void *ptr; fd->tree = tree; fd->bnode = NULL; ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); return 0; } void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { __be32 cur_cnid; __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; search_cnid = fd->search_key->ext.cnid; } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { cur_cnid = fd->key->cat.parent; search_cnid = fd->search_key->cat.parent; } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; } else { cur_cnid = 0; /* used-uninitialized warning */ search_cnid = 0; BUG(); } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); if ((*begin) == (*end)) return 1; } else { if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) (*begin) = (*cur_rec) + 1; else (*end) = (*cur_rec) - 1; } return 0; } int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { int cmpval; cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { (*end) = (*cur_rec); return 1; } if (cmpval < 0) (*begin) = (*cur_rec) + 1; else *(end) = (*cur_rec) - 1; return 0; } /* Find the record in bnode that best matches key (not greater than...)*/ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, search_strategy_t rec_found) { u16 off, len, keylen; int rec; int b, e; int res; BUG_ON(!rec_found); b = 0; e = bnode->num_recs - 1; res = -ENOENT; do { rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); if (rec_found(bnode, fd, &b, &e, &rec)) { res = 0; goto done; } } while (b <= e); if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); } done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) { struct hfs_btree *tree; struct hfs_bnode *bnode; u32 nidx, parent; __be32 data; int height, res; tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); fd->bnode = NULL; nidx = tree->root; if (!nidx) return -ENOENT; height = tree->depth; res = 0; parent = 0; for (;;) { bnode = hfs_bnode_find(tree, nidx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; break; } if (bnode->height != height) goto invalid; if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) goto invalid; bnode->parent = parent; res = __hfs_brec_find(bnode, fd, do_key_compare); if (!height) break; if (fd->record < 0) goto release; parent = nidx; hfs_bnode_read(bnode, &data, fd->entryoffset, 4); nidx = be32_to_cpu(data); hfs_bnode_put(bnode); } fd->bnode = bnode; return res; invalid: pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); return res; } int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) { int res; res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res) return res; if (fd->entrylength > rec_len) return -EINVAL; hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); return 0; } int hfs_brec_goto(struct hfs_find_data *fd, int cnt) { struct hfs_btree *tree; struct hfs_bnode *bnode; int idx, res = 0; u16 off, len, keylen; bnode = fd->bnode; tree = bnode->tree; if (cnt < 0) { cnt = -cnt; while (cnt > fd->record) { cnt -= fd->record + 1; fd->record = bnode->num_recs - 1; idx = bnode->prev; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record -= cnt; } else { while (cnt >= bnode->num_recs - fd->record) { cnt -= bnode->num_recs - fd->record; fd->record = 0; idx = bnode->next; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record += cnt; } len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); if (keylen == 0) { res = -EINVAL; goto out; } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; hfs_bnode_read(bnode, fd->key, off, keylen); out: fd->bnode = bnode; return res; }
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif
13 9 13 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 /* * linux/fs/nls/nls_iso8859-2.c * * Charset iso8859-2 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, /* 0xb0*/ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, /* 0xc0*/ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* 0xd0*/ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* 0xe0*/ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* 0xf0*/ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0xc3, 0xe3, 0xa1, 0xb1, 0xc6, 0xe6, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xa5, 0xb5, 0x00, /* 0x38-0x3f */ 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ 0xd8, 0xf8, 0xa6, 0xb6, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ 0xa9, 0xb9, 0xde, 0xfe, 0xab, 0xbb, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0xac, 0xbc, 0xaf, 0xbf, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-2", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_2(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_2(void) { unregister_nls(&table); } module_init(init_nls_iso8859_2) module_exit(exit_nls_iso8859_2) MODULE_DESCRIPTION("NLS ISO 8859-2 (Latin 2; Slavic/Central European Languages)"); MODULE_LICENSE("Dual BSD/GPL");
410 410 410 138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 // SPDX-License-Identifier: GPL-2.0 /* * Implement mseal() syscall. * * Copyright (c) 2023,2024 Google, Inc. * * Author: Jeff Xu <jeffxu@chromium.org> */ #include <linux/mempolicy.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_context.h> #include <linux/syscalls.h> #include <linux/sched.h> #include "internal.h" static inline void set_vma_sealed(struct vm_area_struct *vma) { vm_flags_set(vma, VM_SEALED); } static bool is_madv_discard(int behavior) { switch (behavior) { case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_REMOVE: case MADV_DONTFORK: case MADV_WIPEONFORK: case MADV_GUARD_INSTALL: return true; } return false; } static bool is_ro_anon(struct vm_area_struct *vma) { /* check anonymous mapping. */ if (vma->vm_file || vma->vm_flags & VM_SHARED) return false; /* * check for non-writable: * PROT=RO or PKRU is not writeable. */ if (!(vma->vm_flags & VM_WRITE) || !arch_vma_access_permitted(vma, true, false, false)) return true; return false; } /* * Check if a vma is allowed to be modified by madvise. */ bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { if (!is_madv_discard(behavior)) return true; if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) return false; /* Allow by default. */ return true; } static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { int ret = 0; vm_flags_t oldflags = vma->vm_flags; if (newflags == oldflags) goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } set_vma_sealed(vma); out: *prev = vma; return ret; } /* * Check for do_mseal: * 1> start is part of a valid vma. * 2> end is part of a valid vma. * 3> No gap (unallocated address) between start and end. * 4> map is sealable. */ static int check_mm_seal(unsigned long start, unsigned long end) { struct vm_area_struct *vma; unsigned long nstart = start; VMA_ITERATOR(vmi, current->mm, start); /* going through each vma to check. */ for_each_vma_range(vmi, vma, end) { if (vma->vm_start > nstart) /* unallocated memory found. */ return -ENOMEM; if (vma->vm_end >= end) return 0; nstart = vma->vm_end; } return -ENOMEM; } /* * Apply sealing. */ static int apply_mm_seal(unsigned long start, unsigned long end) { unsigned long nstart; struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, current->mm, start); vma = vma_iter_load(&vmi); /* * Note: check_mm_seal should already checked ENOMEM case. * so vma should not be null, same for the other ENOMEM cases. */ prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; nstart = start; for_each_vma_range(vmi, vma, end) { int error; unsigned long tmp; vm_flags_t newflags; newflags = vma->vm_flags | VM_SEALED; tmp = vma->vm_end; if (tmp > end) tmp = end; error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) return error; nstart = vma_iter_end(&vmi); } return 0; } /* * mseal(2) seals the VM's meta data from * selected syscalls. * * addr/len: VM address range. * * The address range by addr/len must meet: * start (addr) must be in a valid VMA. * end (addr + len) must be in a valid VMA. * no gap (unallocated memory) between start and end. * start (addr) must be page aligned. * * len: len will be page aligned implicitly. * * Below VMA operations are blocked after sealing. * 1> Unmapping, moving to another location, and shrinking * the size, via munmap() and mremap(), can leave an empty * space, therefore can be replaced with a VMA with a new * set of attributes. * 2> Moving or expanding a different vma into the current location, * via mremap(). * 3> Modifying a VMA via mmap(MAP_FIXED). * 4> Size expansion, via mremap(), does not appear to pose any * specific risks to sealed VMAs. It is included anyway because * the use case is unclear. In any case, users can rely on * merging to expand a sealed VMA. * 5> mprotect and pkey_mprotect. * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) * for anonymous memory, when users don't have write permission to the * memory. Those behaviors can alter region contents by discarding pages, * effectively a memset(0) for anonymous memory. * * flags: reserved. * * return values: * zero: success. * -EINVAL: * invalid input flags. * start address is not page aligned. * Address arange (start + len) overflow. * -ENOMEM: * addr is not a valid address (not allocated). * end (start + len) is not a valid address. * a gap (unallocated memory) between start and end. * -EPERM: * - In 32 bit architecture, sealing is not supported. * Note: * user can call mseal(2) multiple times, adding a seal on an * already sealed memory is a no-action (no error). * * unseal() is not supported. */ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { size_t len; int ret = 0; unsigned long end; struct mm_struct *mm = current->mm; /* Verify flags not set. */ if (flags) return -EINVAL; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero. */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; if (mmap_write_lock_killable(mm)) return -EINTR; /* * First pass, this helps to avoid * partial sealing in case of error in input address range, * e.g. ENOMEM error. */ ret = check_mm_seal(start, end); if (ret) goto out; /* * Second pass, this should success, unless there are errors * from vma_modify_flags, e.g. merge/split error, or process * reaching the max supported VMAs, however, those cases shall * be rare. */ ret = apply_mm_seal(start, end); out: mmap_write_unlock(current->mm); return ret; } SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, flags) { return do_mseal(start, len, flags); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 // SPDX-License-Identifier: GPL-2.0 /* * Power trace points * * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> */ #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/usb.h> #define CREATE_TRACE_POINTS #include <trace/events/rpm.h> EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
121 1 7 7 7 16 11 14 4 4 4 4 37 1 2 2 14 2 153 2 163 167 166 2 164 164 14 154 2 441 441 439 441 49 414 306 14 2 2 14 15 3 3 15 4 10 10 10 11 10 10 1 9 10 11 10 11 4 1 4 11 2 1 8 1 20 4 42 11 11 11 3 11 11 1 11 3 11 11 11 11 18 1 1 17 12 8 17 85 22 63 3 81 4 25 25 25 31 444 446 84 25 79 22 20 3 29 29 4 5 10 9 1 2 10 5 8 2 10 3 14 18 2 17 26 35 29 1 5 29 4 1 19 2 18 25 25 14 11 11 25 10 1 1 2 22 3 21 4 20 20 26 26 395 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 // SPDX-License-Identifier: GPL-2.0-only /* * drivers/net/veth.c * * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc * * Author: Pavel Emelianov <xemul@openvz.org> * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> * */ #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/u64_stats_sync.h> #include <net/rtnetlink.h> #include <net/dst.h> #include <net/netdev_lock.h> #include <net/xfrm.h> #include <net/xdp.h> #include <linux/veth.h> #include <linux/module.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ptr_ring.h> #include <linux/bpf_trace.h> #include <linux/net_tstamp.h> #include <linux/skbuff_ref.h> #include <net/page_pool/helpers.h> #define DRV_NAME "veth" #define DRV_VERSION "1.0" #define VETH_XDP_FLAG BIT(0) #define VETH_RING_SIZE 256 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) #define VETH_XDP_TX_BULK_SIZE 16 #define VETH_XDP_BATCH 16 struct veth_stats { u64 rx_drops; /* xdp */ u64 xdp_packets; u64 xdp_bytes; u64 xdp_redirect; u64 xdp_drops; u64 xdp_tx; u64 xdp_tx_err; u64 peer_tq_xdp_xmit; u64 peer_tq_xdp_xmit_err; }; struct veth_rq_stats { struct veth_stats vs; struct u64_stats_sync syncp; }; struct veth_rq { struct napi_struct xdp_napi; struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ struct net_device *dev; struct bpf_prog __rcu *xdp_prog; struct xdp_mem_info xdp_mem; struct veth_rq_stats stats; bool rx_notify_masked; struct ptr_ring xdp_ring; struct xdp_rxq_info xdp_rxq; struct page_pool *page_pool; }; struct veth_priv { struct net_device __rcu *peer; atomic64_t dropped; struct bpf_prog *_xdp_prog; struct veth_rq *rq; unsigned int requested_headroom; }; struct veth_xdp_tx_bq { struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; unsigned int count; }; /* * ethtool interface */ struct veth_q_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; }; #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) static const struct veth_q_stat_desc veth_rq_stats_desc[] = { { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, { "drops", VETH_RQ_STAT(rx_drops) }, { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, }; #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) static const struct veth_q_stat_desc veth_tq_stats_desc[] = { { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, }; #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) static struct { const char string[ETH_GSTRING_LEN]; } ethtool_stats_keys[] = { { "peer_ifindex" }, }; struct veth_xdp_buff { struct xdp_buff xdp; struct sk_buff *skb; }; static int veth_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.autoneg = AUTONEG_DISABLE; return 0; } static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) { u8 *p = buf; int i, j; switch(stringset) { case ETH_SS_STATS: memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys)); p += sizeof(ethtool_stats_keys); for (i = 0; i < dev->real_num_rx_queues; i++) for (j = 0; j < VETH_RQ_STATS_LEN; j++) ethtool_sprintf(&p, "rx_queue_%u_%.18s", i, veth_rq_stats_desc[j].desc); for (i = 0; i < dev->real_num_tx_queues; i++) for (j = 0; j < VETH_TQ_STATS_LEN; j++) ethtool_sprintf(&p, "tx_queue_%u_%.18s", i, veth_tq_stats_desc[j].desc); page_pool_ethtool_stats_get_strings(p); break; } } static int veth_get_sset_count(struct net_device *dev, int sset) { switch (sset) { case ETH_SS_STATS: return ARRAY_SIZE(ethtool_stats_keys) + VETH_RQ_STATS_LEN * dev->real_num_rx_queues + VETH_TQ_STATS_LEN * dev->real_num_tx_queues + page_pool_ethtool_stats_get_count(); default: return -EOPNOTSUPP; } } static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) { #ifdef CONFIG_PAGE_POOL_STATS struct veth_priv *priv = netdev_priv(dev); struct page_pool_stats pp_stats = {}; int i; for (i = 0; i < dev->real_num_rx_queues; i++) { if (!priv->rq[i].page_pool) continue; page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); } page_pool_ethtool_stats_get(data, &pp_stats); #endif /* CONFIG_PAGE_POOL_STATS */ } static void veth_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); int i, j, idx, pp_idx; data[0] = peer ? peer->ifindex : 0; idx = 1; for (i = 0; i < dev->real_num_rx_queues; i++) { const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; const void *stats_base = (void *)&rq_stats->vs; unsigned int start; size_t offset; do { start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_RQ_STATS_LEN; j++) { offset = veth_rq_stats_desc[j].offset; data[idx + j] = *(u64 *)(stats_base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); idx += VETH_RQ_STATS_LEN; } pp_idx = idx; if (!peer) goto page_pool_stats; rcv_priv = netdev_priv(peer); for (i = 0; i < peer->real_num_rx_queues; i++) { const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; const void *base = (void *)&rq_stats->vs; unsigned int start, tx_idx = idx; size_t offset; tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_TQ_STATS_LEN; j++) { offset = veth_tq_stats_desc[j].offset; data[tx_idx + j] += *(u64 *)(base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); } pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; page_pool_stats: veth_get_page_pool_stats(dev, &data[pp_idx]); } static void veth_get_channels(struct net_device *dev, struct ethtool_channels *channels) { channels->tx_count = dev->real_num_tx_queues; channels->rx_count = dev->real_num_rx_queues; channels->max_tx = dev->num_tx_queues; channels->max_rx = dev->num_rx_queues; } static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch); static const struct ethtool_ops veth_ethtool_ops = { .get_drvinfo = veth_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = veth_get_strings, .get_sset_count = veth_get_sset_count, .get_ethtool_stats = veth_get_ethtool_stats, .get_link_ksettings = veth_get_link_ksettings, .get_ts_info = ethtool_op_get_ts_info, .get_channels = veth_get_channels, .set_channels = veth_set_channels, }; /* general routines */ static bool veth_is_xdp_frame(void *ptr) { return (unsigned long)ptr & VETH_XDP_FLAG; } static struct xdp_frame *veth_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); } static void *veth_xdp_to_ptr(struct xdp_frame *xdp) { return (void *)((unsigned long)xdp | VETH_XDP_FLAG); } static void veth_ptr_free(void *ptr) { if (veth_is_xdp_frame(ptr)) xdp_return_frame(veth_ptr_to_xdp(ptr)); else kfree_skb(ptr); } static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); if (!READ_ONCE(rq->rx_notify_masked) && napi_schedule_prep(&rq->xdp_napi)) { WRITE_ONCE(rq->rx_notify_masked, true); __napi_schedule(&rq->xdp_napi); } } static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) { if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) return NETDEV_TX_BUSY; /* signal qdisc layer */ return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ } static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, struct veth_rq *rq, bool xdp) { return __dev_forward_skb(dev, skb) ?: xdp ? veth_xdp_rx(rq, skb) : __netif_rx(skb); } /* return true if the specified skb has chances of GRO aggregation * Don't strive for accuracy, but try to avoid GRO overhead in the most * common scenarios. * When XDP is enabled, all traffic is considered eligible, as the xmit * device has TSO off. * When TSO is enabled on the xmit device, we are likely interested only * in UDP aggregation, explicitly check for that if the skb is suspected * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - * to belong to locally generated UDP traffic. */ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, const struct net_device *rcv, const struct sk_buff *skb) { return !(dev->features & NETIF_F_ALL_TSO) || (skb->destructor == sock_wfree && rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); } static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct veth_rq *rq = NULL; struct netdev_queue *txq; struct net_device *rcv; int length = skb->len; bool use_napi = false; int ret, rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { kfree_skb(skb); goto drop; } rcv_priv = netdev_priv(rcv); rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; /* The napi pointer is available when an XDP program is * attached or when GRO is enabled * Don't bother with napi/GRO if the skb can't be aggregated */ use_napi = rcu_access_pointer(rq->napi) && veth_skb_is_eligible_for_gro(dev, rcv, skb); } skb_tx_timestamp(skb); ret = veth_forward_skb(rcv, skb, rq, use_napi); switch (ret) { case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ if (!use_napi) dev_sw_netstats_tx_add(dev, 1, length); else __veth_xdp_flush(rq); break; case NETDEV_TX_BUSY: /* If a qdisc is attached to our virtual device, returning * NETDEV_TX_BUSY is allowed. */ txq = netdev_get_tx_queue(dev, rxq); if (qdisc_txq_has_no_queue(txq)) { dev_kfree_skb_any(skb); goto drop; } /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ __skb_push(skb, ETH_HLEN); /* Depend on prior success packets started NAPI consumer via * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped, * paired with empty check in veth_poll(). */ netif_tx_stop_queue(txq); smp_mb__after_atomic(); if (unlikely(__ptr_ring_empty(&rq->xdp_ring))) netif_tx_wake_queue(txq); break; case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: atomic64_inc(&priv->dropped); ret = NET_XMIT_DROP; break; default: net_crit_ratelimited("%s(%s): Invalid return code(%d)", __func__, dev->name, ret); } rcu_read_unlock(); return ret; } static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; result->peer_tq_xdp_xmit_err = 0; result->xdp_packets = 0; result->xdp_tx_err = 0; result->xdp_bytes = 0; result->rx_drops = 0; for (i = 0; i < dev->num_rx_queues; i++) { u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; struct veth_rq_stats *stats = &priv->rq[i].stats; unsigned int start; do { start = u64_stats_fetch_begin(&stats->syncp); peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; xdp_tx_err = stats->vs.xdp_tx_err; packets = stats->vs.xdp_packets; bytes = stats->vs.xdp_bytes; drops = stats->vs.rx_drops; } while (u64_stats_fetch_retry(&stats->syncp, start)); result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; result->xdp_bytes += bytes; result->rx_drops += drops; } } static void veth_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; struct veth_stats rx; tot->tx_dropped = atomic64_read(&priv->dropped); dev_fetch_sw_netstats(tot, dev->tstats); veth_stats_rx(&rx, dev); tot->tx_dropped += rx.xdp_tx_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; tot->rx_bytes += rx.xdp_bytes; tot->rx_packets += rx.xdp_packets; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (peer) { struct rtnl_link_stats64 tot_peer = {}; dev_fetch_sw_netstats(&tot_peer, peer->tstats); tot->rx_bytes += tot_peer.tx_bytes; tot->rx_packets += tot_peer.tx_packets; veth_stats_rx(&rx, peer); tot->tx_dropped += rx.peer_tq_xdp_xmit_err; tot->rx_dropped += rx.xdp_tx_err; tot->tx_bytes += rx.xdp_bytes; tot->tx_packets += rx.xdp_packets; } rcu_read_unlock(); } /* fake multicast ability */ static void veth_set_multicast_list(struct net_device *dev) { } static int veth_select_rxq(struct net_device *dev) { return smp_processor_id() % dev->real_num_rx_queues; } static struct net_device *veth_peer_dev(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); /* Callers must be under RCU read side. */ return rcu_dereference(priv->peer); } static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); int i, ret = -ENXIO, nxmit = 0; struct net_device *rcv; unsigned int max_len; struct veth_rq *rq; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv)) goto out; rcv_priv = netdev_priv(rcv); rq = &rcv_priv->rq[veth_select_rxq(rcv)]; /* The napi pointer is set if NAPI is enabled, which ensures that * xdp_ring is initialized on receive side and the peer device is up. */ if (!rcu_access_pointer(rq->napi)) goto out; max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; spin_lock(&rq->xdp_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *frame = frames[i]; void *ptr = veth_xdp_to_ptr(frame); if (unlikely(xdp_get_frame_len(frame) > max_len || __ptr_ring_produce(&rq->xdp_ring, ptr))) break; nxmit++; } spin_unlock(&rq->xdp_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __veth_xdp_flush(rq); ret = nxmit; if (ndo_xmit) { u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.peer_tq_xdp_xmit += nxmit; rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; u64_stats_update_end(&rq->stats.syncp); } out: rcu_read_unlock(); return ret; } static int veth_ndo_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { int err; err = veth_xdp_xmit(dev, n, frames, flags, true); if (err < 0) { struct veth_priv *priv = netdev_priv(dev); atomic64_add(n, &priv->dropped); } return err; } static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) { int sent, i, err = 0, drops; sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); if (sent < 0) { err = sent; sent = 0; } for (i = sent; unlikely(i < bq->count); i++) xdp_return_frame(bq->q[i]); drops = bq->count - sent; trace_xdp_bulk_tx(rq->dev, sent, drops, err); u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_tx += sent; rq->stats.vs.xdp_tx_err += drops; u64_stats_update_end(&rq->stats.syncp); bq->count = 0; } static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) { struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); struct net_device *rcv; struct veth_rq *rcv_rq; rcu_read_lock(); veth_xdp_flush_bq(rq, bq); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv)) goto out; rcv_priv = netdev_priv(rcv); rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; /* xdp_ring is initialized on receive side? */ if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) goto out; __veth_xdp_flush(rcv_rq); out: rcu_read_unlock(); } static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, struct veth_xdp_tx_bq *bq) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); if (unlikely(!frame)) return -EOVERFLOW; if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) veth_xdp_flush_bq(rq, bq); bq->q[bq->count++] = frame; return 0; } static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, struct xdp_frame *frame, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { struct veth_xdp_buff vxbuf; struct xdp_buff *xdp = &vxbuf.xdp; u32 act; xdp_convert_frame_to_buff(frame, xdp); xdp->rxq = &rq->xdp_rxq; vxbuf.skb = NULL; act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: if (xdp_update_frame_from_buff(xdp, frame)) goto err_xdp; break; case XDP_TX: orig_frame = *frame; xdp->rxq->mem.type = frame->mem_type; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; stats->rx_drops++; goto err_xdp; } stats->xdp_tx++; rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; xdp->rxq->mem.type = frame->mem_type; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; goto err_xdp; } stats->xdp_redirect++; rcu_read_unlock(); goto xdp_xmit; default: bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); fallthrough; case XDP_DROP: stats->xdp_drops++; goto err_xdp; } } rcu_read_unlock(); return frame; err_xdp: rcu_read_unlock(); xdp_return_frame(frame); xdp_xmit: return NULL; } /* frames array contains VETH_XDP_BATCH at most */ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, int n_xdpf, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { void *skbs[VETH_XDP_BATCH]; int i; if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { for (i = 0; i < n_xdpf; i++) xdp_return_frame(frames[i]); stats->rx_drops += n_xdpf; return; } for (i = 0; i < n_xdpf; i++) { struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(frames[i], skb, rq->dev); if (!skb) { xdp_return_frame(frames[i]); stats->rx_drops++; continue; } napi_gro_receive(&rq->xdp_napi, skb); } } static void veth_xdp_get(struct xdp_buff *xdp) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i; get_page(virt_to_page(xdp->data)); if (likely(!xdp_buff_has_frags(xdp))) return; for (i = 0; i < sinfo->nr_frags; i++) __skb_frag_ref(&sinfo->frags[i]); } static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, struct xdp_buff *xdp, struct sk_buff **pskb) { struct sk_buff *skb = *pskb; u32 frame_sz; if (skb_shared(skb) || skb_head_is_locked(skb) || skb_shinfo(skb)->nr_frags || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) goto drop; skb = *pskb; } /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = skb_end_pointer(skb) - skb->head; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), skb_headlen(skb), true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } *pskb = skb; return 0; drop: consume_skb(skb); *pskb = NULL; return -ENOMEM; } static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; struct veth_xdp_buff vxbuf; struct xdp_buff *xdp = &vxbuf.xdp; u32 act, metalen; int off; skb_prepare_for_gro(skb); rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (unlikely(!xdp_prog)) { rcu_read_unlock(); goto out; } __skb_push(skb, skb->data - skb_mac_header(skb)); if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) goto drop; vxbuf.skb = skb; orig_data = xdp->data; orig_data_end = xdp->data_end; act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: veth_xdp_get(xdp); consume_skb(skb); xdp->rxq->mem = rq->xdp_mem; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); stats->rx_drops++; goto err_xdp; } stats->xdp_tx++; rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: veth_xdp_get(xdp); consume_skb(skb); xdp->rxq->mem = rq->xdp_mem; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { stats->rx_drops++; goto err_xdp; } stats->xdp_redirect++; rcu_read_unlock(); goto xdp_xmit; default: bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); fallthrough; case XDP_DROP: stats->xdp_drops++; goto xdp_drop; } rcu_read_unlock(); /* check if bpf_xdp_adjust_head was used */ off = orig_data - xdp->data; if (off > 0) __skb_push(skb, off); else if (off < 0) __skb_pull(skb, -off); skb_reset_mac_header(skb); /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) __skb_put(skb, off); /* positive on grow, negative on shrink */ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; skb->protocol = eth_type_trans(skb, rq->dev); metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); out: return skb; drop: stats->rx_drops++; xdp_drop: rcu_read_unlock(); kfree_skb(skb); return NULL; err_xdp: rcu_read_unlock(); xdp_return_buff(xdp); xdp_xmit: return NULL; } static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { struct veth_priv *priv = netdev_priv(rq->dev); int queue_idx = rq->xdp_rxq.queue_index; struct netdev_queue *peer_txq; struct net_device *peer_dev; int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; /* NAPI functions as RCU section */ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); if (!ptr) break; if (veth_is_xdp_frame(ptr)) { /* ndo_xdp_xmit */ struct xdp_frame *frame = veth_ptr_to_xdp(ptr); stats->xdp_bytes += xdp_get_frame_len(frame); frame = veth_xdp_rcv_one(rq, frame, bq, stats); if (frame) { /* XDP_PASS */ xdpf[n_xdpf++] = frame; if (n_xdpf == VETH_XDP_BATCH) { veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); n_xdpf = 0; } } } else { /* ndo_start_xmit */ struct sk_buff *skb = ptr; stats->xdp_bytes += skb->len; skb = veth_xdp_rcv_skb(rq, skb, bq, stats); if (skb) { if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) netif_receive_skb(skb); else napi_gro_receive(&rq->xdp_napi, skb); } } done++; } if (n_xdpf) veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; rq->stats.vs.xdp_drops += stats->xdp_drops; rq->stats.vs.rx_drops += stats->rx_drops; rq->stats.vs.xdp_packets += done; u64_stats_update_end(&rq->stats.syncp); if (peer_txq && unlikely(netif_tx_queue_stopped(peer_txq))) netif_tx_wake_queue(peer_txq); return done; } static int veth_poll(struct napi_struct *napi, int budget) { struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi); struct veth_stats stats = {}; struct veth_xdp_tx_bq bq; int done; bq.count = 0; xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); if (stats.xdp_redirect > 0) xdp_do_flush(); if (done < budget && napi_complete_done(napi, done)) { /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { if (napi_schedule_prep(&rq->xdp_napi)) { WRITE_ONCE(rq->rx_notify_masked, true); __napi_schedule(&rq->xdp_napi); } } } if (stats.xdp_tx > 0) veth_xdp_flush(rq, &bq); xdp_clear_return_frame_no_direct(); return done; } static int veth_create_page_pool(struct veth_rq *rq) { struct page_pool_params pp_params = { .order = 0, .pool_size = VETH_RING_SIZE, .nid = NUMA_NO_NODE, .dev = &rq->dev->dev, }; rq->page_pool = page_pool_create(&pp_params); if (IS_ERR(rq->page_pool)) { int err = PTR_ERR(rq->page_pool); rq->page_pool = NULL; return err; } return 0; } static int __veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { err = veth_create_page_pool(&priv->rq[i]); if (err) goto err_page_pool; } for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); if (err) goto err_xdp_ring; } for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; napi_enable(&rq->xdp_napi); rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; err_xdp_ring: for (i--; i >= start; i--) ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); i = end; err_page_pool: for (i--; i >= start; i--) { page_pool_destroy(priv->rq[i].page_pool); priv->rq[i].page_pool = NULL; } return err; } static int __veth_napi_enable(struct net_device *dev) { return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); } static void veth_napi_del_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rcu_assign_pointer(priv->rq[i].napi, NULL); napi_disable(&rq->xdp_napi); __netif_napi_del(&rq->xdp_napi); } synchronize_net(); for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rq->rx_notify_masked = false; ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); } for (i = start; i < end; i++) { page_pool_destroy(priv->rq[i].page_pool); priv->rq[i].page_pool = NULL; } } static void veth_napi_del(struct net_device *dev) { veth_napi_del_range(dev, 0, dev->real_num_rx_queues); } static bool veth_gro_requested(const struct net_device *dev) { return !!(dev->wanted_features & NETIF_F_GRO); } static int veth_enable_xdp_range(struct net_device *dev, int start, int end, bool napi_already_on) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; if (!napi_already_on) netif_napi_add(dev, &rq->xdp_napi, veth_poll); err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); if (err < 0) goto err_rxq_reg; err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) goto err_reg_mem; /* Save original mem info as it can be overwritten */ rq->xdp_mem = rq->xdp_rxq.mem; } return 0; err_reg_mem: xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); err_rxq_reg: for (i--; i >= start; i--) { struct veth_rq *rq = &priv->rq[i]; xdp_rxq_info_unreg(&rq->xdp_rxq); if (!napi_already_on) netif_napi_del(&rq->xdp_napi); } return err; } static void veth_disable_xdp_range(struct net_device *dev, int start, int end, bool delete_napi) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rq->xdp_rxq.mem = rq->xdp_mem; xdp_rxq_info_unreg(&rq->xdp_rxq); if (delete_napi) netif_napi_del(&rq->xdp_napi); } } static int veth_enable_xdp(struct net_device *dev) { bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); struct veth_priv *priv = netdev_priv(dev); int err, i; if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); if (err) return err; if (!napi_already_on) { err = __veth_napi_enable(dev); if (err) { veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); return err; } } } for (i = 0; i < dev->real_num_rx_queues; i++) { rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; } static void veth_disable_xdp(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = 0; i < dev->real_num_rx_queues; i++) rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); if (!netif_running(dev) || !veth_gro_requested(dev)) veth_napi_del(dev); veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); } static int veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_add(dev, &rq->xdp_napi, veth_poll); } err = __veth_napi_enable_range(dev, start, end); if (err) { for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_del(&rq->xdp_napi); } return err; } return err; } static int veth_napi_enable(struct net_device *dev) { return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); } static void veth_disable_range_safe(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); if (start >= end) return; if (priv->_xdp_prog) { veth_napi_del_range(dev, start, end); veth_disable_xdp_range(dev, start, end, false); } else if (veth_gro_requested(dev)) { veth_napi_del_range(dev, start, end); } } static int veth_enable_range_safe(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err; if (start >= end) return 0; if (priv->_xdp_prog) { /* these channels are freshly initialized, napi is not on there even * when GRO is requeste */ err = veth_enable_xdp_range(dev, start, end, false); if (err) return err; err = __veth_napi_enable_range(dev, start, end); if (err) { /* on error always delete the newly added napis */ veth_disable_xdp_range(dev, start, end, true); return err; } } else if (veth_gro_requested(dev)) { return veth_napi_enable_range(dev, start, end); } return 0; } static void veth_set_xdp_features(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; peer = rtnl_dereference(priv->peer); if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { struct veth_priv *priv_peer = netdev_priv(peer); xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; if (priv_peer->_xdp_prog || veth_gro_requested(peer)) val |= NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag(dev, val); } else { xdp_clear_features_flag(dev); } } static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch) { struct veth_priv *priv = netdev_priv(dev); unsigned int old_rx_count, new_rx_count; struct veth_priv *peer_priv; struct net_device *peer; int err; /* sanity check. Upper bounds are already enforced by the caller */ if (!ch->rx_count || !ch->tx_count) return -EINVAL; /* avoid braking XDP, if that is enabled */ peer = rtnl_dereference(priv->peer); peer_priv = peer ? netdev_priv(peer) : NULL; if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) return -EINVAL; if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) return -EINVAL; old_rx_count = dev->real_num_rx_queues; new_rx_count = ch->rx_count; if (netif_running(dev)) { /* turn device off */ netif_carrier_off(dev); if (peer) netif_carrier_off(peer); /* try to allocate new resurces, as needed*/ err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); if (err) goto out; } err = netif_set_real_num_rx_queues(dev, ch->rx_count); if (err) goto revert; err = netif_set_real_num_tx_queues(dev, ch->tx_count); if (err) { int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); /* this error condition could happen only if rx and tx change * in opposite directions (e.g. tx nr raises, rx nr decreases) * and we can't do anything to fully restore the original * status */ if (err2) pr_warn("Can't restore rx queues config %d -> %d %d", new_rx_count, old_rx_count, err2); else goto revert; } out: if (netif_running(dev)) { /* note that we need to swap the arguments WRT the enable part * to identify the range we have to disable */ veth_disable_range_safe(dev, new_rx_count, old_rx_count); netif_carrier_on(dev); if (peer) netif_carrier_on(peer); } /* update XDP supported features */ veth_set_xdp_features(dev); if (peer) veth_set_xdp_features(peer); return err; revert: new_rx_count = old_rx_count; old_rx_count = ch->rx_count; goto out; } static int veth_open(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); int err; if (!peer) return -ENOTCONN; if (priv->_xdp_prog) { err = veth_enable_xdp(dev); if (err) return err; } else if (veth_gro_requested(dev)) { err = veth_napi_enable(dev); if (err) return err; } if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } veth_set_xdp_features(dev); return 0; } static int veth_close(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); netif_carrier_off(dev); if (peer) netif_carrier_off(peer); if (priv->_xdp_prog) veth_disable_xdp(dev); else if (veth_gro_requested(dev)) veth_napi_del(dev); return 0; } static int is_valid_veth_mtu(int mtu) { return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; } static int veth_alloc_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!priv->rq) return -ENOMEM; for (i = 0; i < dev->num_rx_queues; i++) { priv->rq[i].dev = dev; u64_stats_init(&priv->rq[i].stats.syncp); } return 0; } static void veth_free_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); kvfree(priv->rq); } static int veth_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return veth_alloc_queues(dev); } static void veth_dev_free(struct net_device *dev) { veth_free_queues(dev); } #ifdef CONFIG_NET_POLL_CONTROLLER static void veth_poll_controller(struct net_device *dev) { /* veth only receives frames when its peer sends one * Since it has nothing to do with disabling irqs, we are guaranteed * never to have pending data when we poll for it so * there is nothing to do here. * * We need this though so netpoll recognizes us as an interface that * supports polling, which enables bridge devices in virt setups to * still use netconsole */ } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int veth_get_iflink(const struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; int iflink; rcu_read_lock(); peer = rcu_dereference(priv->peer); iflink = peer ? READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; } static netdev_features_t veth_fix_features(struct net_device *dev, netdev_features_t features) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; peer = rtnl_dereference(priv->peer); if (peer) { struct veth_priv *peer_priv = netdev_priv(peer); if (peer_priv->_xdp_prog) features &= ~NETIF_F_GSO_SOFTWARE; } return features; } static int veth_set_features(struct net_device *dev, netdev_features_t features) { netdev_features_t changed = features ^ dev->features; struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; int err; if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) return 0; peer = rtnl_dereference(priv->peer); if (features & NETIF_F_GRO) { err = veth_napi_enable(dev); if (err) return err; if (peer) xdp_features_set_redirect_target(peer, true); } else { if (peer) xdp_features_clear_redirect_target(peer); veth_napi_del(dev); } return 0; } static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); struct net_device *peer; if (new_hr < 0) new_hr = 0; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (unlikely(!peer)) goto out; peer_priv = netdev_priv(peer); priv->requested_headroom = new_hr; new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); dev->needed_headroom = new_hr; peer->needed_headroom = new_hr; out: rcu_read_unlock(); } static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct veth_priv *priv = netdev_priv(dev); struct bpf_prog *old_prog; struct net_device *peer; unsigned int max_mtu; int err; old_prog = priv->_xdp_prog; priv->_xdp_prog = prog; peer = rtnl_dereference(priv->peer); if (prog) { if (!peer) { NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); err = -ENOTCONN; goto err; } max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - peer->hard_header_len; /* Allow increasing the max_mtu if the program supports * XDP fragments. */ if (prog->aux->xdp_has_frags) max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; if (peer->mtu > max_mtu) { NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); err = -ERANGE; goto err; } if (dev->real_num_rx_queues < peer->real_num_tx_queues) { NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); err = -ENOSPC; goto err; } if (dev->flags & IFF_UP) { err = veth_enable_xdp(dev); if (err) { NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); goto err; } } if (!old_prog) { peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; peer->max_mtu = max_mtu; } xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { if (peer && !veth_gro_requested(dev)) xdp_features_clear_redirect_target(peer); if (dev->flags & IFF_UP) veth_disable_xdp(dev); if (peer) { peer->hw_features |= NETIF_F_GSO_SOFTWARE; peer->max_mtu = ETH_MAX_MTU; } } bpf_prog_put(old_prog); } if ((!!old_prog ^ !!prog) && peer) netdev_update_features(peer); return 0; err: priv->_xdp_prog = old_prog; return err; } static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return veth_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) { struct veth_xdp_buff *_ctx = (void *)ctx; if (!_ctx->skb) return -ENODATA; *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; return 0; } static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { struct veth_xdp_buff *_ctx = (void *)ctx; struct sk_buff *skb = _ctx->skb; if (!skb) return -ENODATA; *hash = skb_get_hash(skb); *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; return 0; } static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci) { const struct veth_xdp_buff *_ctx = (void *)ctx; const struct sk_buff *skb = _ctx->skb; int err; if (!skb) return -ENODATA; err = __vlan_hwaccel_get_tag(skb, vlan_tci); if (err) return err; *vlan_proto = skb->vlan_proto; return err; } static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, .ndo_stop = veth_close, .ndo_start_xmit = veth_xmit, .ndo_get_stats64 = veth_get_stats64, .ndo_set_rx_mode = veth_set_multicast_list, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = veth_poll_controller, #endif .ndo_get_iflink = veth_get_iflink, .ndo_fix_features = veth_fix_features, .ndo_set_features = veth_set_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, .ndo_get_peer_dev = veth_peer_dev, }; static const struct xdp_metadata_ops veth_xdp_metadata_ops = { .xmo_rx_timestamp = veth_xdp_rx_timestamp, .xmo_rx_hash = veth_xdp_rx_hash, .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) static void veth_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_PHONY_HEADROOM; dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->lltx = true; dev->netdev_ops = &veth_netdev_ops; dev->xdp_metadata_ops = &veth_xdp_metadata_ops; dev->ethtool_ops = &veth_ethtool_ops; dev->features |= VETH_FEATURES; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* * netlink interface */ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (tb[IFLA_MTU]) { if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) return -EINVAL; } return 0; } static struct rtnl_link_ops veth_link_ops; static void veth_disable_gro(struct net_device *dev) { dev->features &= ~NETIF_F_GRO; dev->wanted_features &= ~NETIF_F_GRO; netdev_update_features(dev); } static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) { int err; if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { err = netif_set_real_num_tx_queues(dev, 1); if (err) return err; } if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { err = netif_set_real_num_rx_queues(dev, 1); if (err) return err; } return 0; } static int veth_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *peer_net = rtnl_newlink_peer_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; struct net_device *peer; struct veth_priv *priv; char ifname[IFNAMSIZ]; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; unsigned char name_assign_type; struct ifinfomsg *ifmp; /* * create and register peer first */ if (data && data[VETH_INFO_PEER]) { struct nlattr *nla_peer = data[VETH_INFO_PEER]; ifmp = nla_data(nla_peer); rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } else { ifmp = NULL; tbp = tb; } if (ifmp && tbp[IFLA_IFNAME]) { nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); name_assign_type = NET_NAME_ENUM; } peer = rtnl_create_link(peer_net, ifname, name_assign_type, &veth_link_ops, tbp, extack); if (IS_ERR(peer)) return PTR_ERR(peer); if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); if (ifmp && (dev->ifindex != 0)) peer->ifindex = ifmp->ifi_index; netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); if (err < 0) goto err_register_peer; /* keep GRO disabled by default to be consistent with the established * veth behavior */ veth_disable_gro(peer); netif_carrier_off(peer); err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto err_configure_peer; /* * register dev last * * note, that since we've registered new device the dev's name * should be re-allocated */ if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); err = register_netdevice(dev); if (err < 0) goto err_register_dev; netif_carrier_off(dev); /* * tie the deviced together */ priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); err = veth_init_queues(dev, tb); if (err) goto err_queues; priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); err = veth_init_queues(peer, tb); if (err) goto err_queues; veth_disable_gro(dev); /* update XDP supported features */ veth_set_xdp_features(dev); veth_set_xdp_features(peer); return 0; err_queues: unregister_netdevice(dev); err_register_dev: /* nothing to do */ err_configure_peer: unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); return err; } static void veth_dellink(struct net_device *dev, struct list_head *head) { struct veth_priv *priv; struct net_device *peer; priv = netdev_priv(dev); peer = rtnl_dereference(priv->peer); /* Note : dellink() is called from default_device_exit_batch(), * before a rcu_synchronize() point. The devices are guaranteed * not being freed before one RCU grace period. */ RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(dev, head); if (peer) { priv = netdev_priv(peer); RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(peer, head); } } static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, }; static struct net *veth_get_link_net(const struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); return peer ? dev_net(peer) : dev_net(dev); } static unsigned int veth_get_num_queues(void) { /* enforce the same queue limit as rtnl_create_link */ int queues = num_possible_cpus(); if (queues > 4096) queues = 4096; return queues; } static struct rtnl_link_ops veth_link_ops = { .kind = DRV_NAME, .priv_size = sizeof(struct veth_priv), .setup = veth_setup, .validate = veth_validate, .newlink = veth_newlink, .dellink = veth_dellink, .policy = veth_policy, .peer_type = VETH_INFO_PEER, .maxtype = VETH_INFO_MAX, .get_link_net = veth_get_link_net, .get_num_tx_queues = veth_get_num_queues, .get_num_rx_queues = veth_get_num_queues, }; /* * init/fini */ static __init int veth_init(void) { return rtnl_link_register(&veth_link_ops); } static __exit void veth_exit(void) { rtnl_link_unregister(&veth_link_ops); } module_init(veth_init); module_exit(veth_exit); MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_RTNL_LINK(DRV_NAME);
84 84 83 84 17 86 106 102 101 87 16 103 13 13 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. * Copyright (C) 2014 Fujitsu. All rights reserved. */ #include <linux/kthread.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> #include <trace/events/btrfs.h> #include "async-thread.h" enum { WORK_DONE_BIT, WORK_ORDER_DONE_BIT, }; #define NO_THRESHOLD (-1) #define DEFAULT_THRESHOLD (32) struct btrfs_workqueue { struct workqueue_struct *normal_wq; /* File system this workqueue services */ struct btrfs_fs_info *fs_info; /* List head pointing to ordered work list */ struct list_head ordered_list; /* Spinlock for ordered_list */ spinlock_t list_lock; /* Thresholding related variants */ atomic_t pending; /* Up limit of concurrency workers */ int limit_active; /* Current number of concurrency workers */ int current_active; /* Threshold to change current_active */ int thresh; unsigned int count; spinlock_t thres_lock; }; struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) { return wq->fs_info; } struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) { return work->wq->fs_info; } bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) { /* * We could compare wq->pending with num_online_cpus() * to support "thresh == NO_THRESHOLD" case, but it requires * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's * postpone it until someone needs the support of that case. */ if (wq->thresh == NO_THRESHOLD) return false; return atomic_read(&wq->pending) > wq->thresh * 2; } static void btrfs_init_workqueue(struct btrfs_workqueue *wq, struct btrfs_fs_info *fs_info) { wq->fs_info = fs_info; atomic_set(&wq->pending, 0); INIT_LIST_HEAD(&wq->ordered_list); spin_lock_init(&wq->list_lock); spin_lock_init(&wq->thres_lock); } struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh) { struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; btrfs_init_workqueue(ret, fs_info); ret->limit_active = limit_active; if (thresh == 0) thresh = DEFAULT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ if (thresh < DEFAULT_THRESHOLD) { ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { /* * For threshold-able wq, let its concurrency grow on demand. * Use minimal max_active at alloc time to reduce resource * usage. */ ret->current_active = 1; ret->thresh = thresh; } ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, name); if (!ret->normal_wq) { kfree(ret); return NULL; } trace_btrfs_workqueue_alloc(ret, name); return ret; } struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( struct btrfs_fs_info *fs_info, const char *name, unsigned int flags) { struct btrfs_workqueue *ret; ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; btrfs_init_workqueue(ret, fs_info); /* Ordered workqueues don't allow @max_active adjustments. */ ret->limit_active = 1; ret->current_active = 1; ret->thresh = NO_THRESHOLD; ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name); if (!ret->normal_wq) { kfree(ret); return NULL; } trace_btrfs_workqueue_alloc(ret, name); return ret; } /* * Hook for threshold which will be called in btrfs_queue_work. * This hook WILL be called in IRQ handler context, * so workqueue_set_max_active MUST NOT be called in this hook */ static inline void thresh_queue_hook(struct btrfs_workqueue *wq) { if (wq->thresh == NO_THRESHOLD) return; atomic_inc(&wq->pending); } /* * Hook for threshold which will be called before executing the work, * This hook is called in kthread content. * So workqueue_set_max_active is called here. */ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; bool need_change = false; if (wq->thresh == NO_THRESHOLD) return; atomic_dec(&wq->pending); spin_lock(&wq->thres_lock); /* * Use wq->count to limit the calling frequency of * workqueue_set_max_active. */ wq->count++; wq->count %= (wq->thresh / 4); if (!wq->count) goto out; new_current_active = wq->current_active; /* * pending may be changed later, but it's OK since we really * don't need it so accurate to calculate new_max_active. */ pending = atomic_read(&wq->pending); if (pending > wq->thresh) new_current_active++; if (pending < wq->thresh / 2) new_current_active--; new_current_active = clamp_val(new_current_active, 1, wq->limit_active); if (new_current_active != wq->current_active) { need_change = true; wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); if (need_change) workqueue_set_max_active(wq->normal_wq, wq->current_active); } static void run_ordered_work(struct btrfs_workqueue *wq, struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; struct btrfs_work *work; spinlock_t *lock = &wq->list_lock; unsigned long flags; bool free_self = false; while (1) { spin_lock_irqsave(lock, flags); if (list_empty(list)) break; work = list_first_entry(list, struct btrfs_work, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; /* * Orders all subsequent loads after reading WORK_DONE_BIT, * paired with the smp_mb__before_atomic in btrfs_work_helper * this guarantees that the ordered function will see all * updates from ordinary work function. */ smp_rmb(); /* * we are going to call the ordered done function, but * we leave the work item on the list as a barrier so * that later work items that are done don't have their * functions called before this one returns */ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; trace_btrfs_ordered_sched(work); spin_unlock_irqrestore(lock, flags); work->ordered_func(work, false); /* now take the lock again and drop our item from the list */ spin_lock_irqsave(lock, flags); list_del(&work->ordered_list); spin_unlock_irqrestore(lock, flags); if (work == self) { /* * This is the work item that the worker is currently * executing. * * The kernel workqueue code guarantees non-reentrancy * of work items. I.e., if a work item with the same * address and work function is queued twice, the second * execution is blocked until the first one finishes. A * work item may be freed and recycled with the same * work function; the workqueue code assumes that the * original work item cannot depend on the recycled work * item in that case (see find_worker_executing_work()). * * Note that different types of Btrfs work can depend on * each other, and one type of work on one Btrfs * filesystem may even depend on the same type of work * on another Btrfs filesystem via, e.g., a loop device. * Therefore, we must not allow the current work item to * be recycled until we are really done, otherwise we * break the above assumption and can deadlock. */ free_self = true; } else { /* * We don't want to call the ordered free functions with * the lock held. */ work->ordered_func(work, true); /* NB: work must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, work); } } spin_unlock_irqrestore(lock, flags); if (free_self) { self->ordered_func(self, true); /* NB: self must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, self); } } static void btrfs_work_helper(struct work_struct *normal_work) { struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); struct btrfs_workqueue *wq = work->wq; bool need_order = false; /* * We should not touch things inside work in the following cases: * 1) after work->func() if it has no ordered_func(..., true) to free * Since the struct is freed in work->func(). * 2) after setting WORK_DONE_BIT * The work may be freed in other threads almost instantly. * So we save the needed things here. */ if (work->ordered_func) need_order = true; trace_btrfs_work_sched(work); thresh_exec_hook(wq); work->func(work); if (need_order) { /* * Ensures all memory accesses done in the work function are * ordered before setting the WORK_DONE_BIT. Ensuring the thread * which is going to executed the ordered work sees them. * Pairs with the smp_rmb in run_ordered_work. */ smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq, work); } else { /* NB: work must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, work); } } void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, btrfs_ordered_func_t ordered_func) { work->func = func; work->ordered_func = ordered_func; INIT_WORK(&work->normal_work, btrfs_work_helper); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; } void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) { unsigned long flags; work->wq = wq; thresh_queue_hook(wq); if (work->ordered_func) { spin_lock_irqsave(&wq->list_lock, flags); list_add_tail(&work->ordered_list, &wq->ordered_list); spin_unlock_irqrestore(&wq->list_lock, flags); } trace_btrfs_work_queued(work); queue_work(wq->normal_wq, &work->normal_work); } void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { if (!wq) return; destroy_workqueue(wq->normal_wq); trace_btrfs_workqueue_destroy(wq); kfree(wq); } void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) { if (wq) wq->limit_active = limit_active; } void btrfs_flush_workqueue(struct btrfs_workqueue *wq) { flush_workqueue(wq->normal_wq); }
10 6 4 6 6 14 14 1 11 11 2102 949 439 32 32 7448 7470 7461 1590 1528 119 8 7902 53 7865 554 12 537 120 1 980 88 8 31 6 113 331 335 329 3 590 21 567 3 209 149 3 141 85 15 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * This file contains the interface functions for the various time related * system calls: time, stime, gettimeofday, settimeofday, adjtime * * Modification history: * * 1993-09-02 Philip Gladstone * Created file with time related functions from sched/core.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code * 1995-08-13 Torsten Duwe * kernel PLL updated to 1994-12-13 specs (rfc-1589) * 1999-01-16 Ulrich Windl * Introduced error checking for many cases in adjtimex(). * Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) * (Even though the technical memorandum forbids it) * 2004-07-14 Christoph Lameter * Added getnstimeofday to allow the posix timer functions to return * with nanosecond accuracy */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/capability.h> #include <linux/timekeeper_internal.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> #include <linux/math64.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> #include <generated/timeconst.h> #include "timekeeping.h" /* * The timezone where the local system is located. Used as a default by some * programs who obtain this value by using gettimeofday. */ struct timezone sys_tz; EXPORT_SYMBOL(sys_tz); #ifdef __ARCH_WANT_SYS_TIME /* * sys_time() can be implemented in user-level using * sys_gettimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). */ SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } /* * sys_stime() can be implemented in user-level using * sys_settimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). */ SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime64(&tv, NULL); if (err) return err; do_settimeofday64(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME */ #ifdef CONFIG_COMPAT_32BIT_TIME #ifdef __ARCH_WANT_SYS_TIME32 /* old_time32_t is a 32 bit "long" and needs to get converted. */ SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime64(&tv, NULL); if (err) return err; do_settimeofday64(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME32 */ #endif SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, * we will warp the clock so that it is ticking UTC time instead of * local time. Presumably, if someone is setting the timezone then we * are running in an environment where the programs understand about * timezones. This should be done at boot time in the /etc/rc script, * as soon as possible, so that the clock can be set right. Otherwise, * various programs will get confused when the clock gets warped. */ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); if (error) return error; if (tz) { /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { firsttime = 0; if (!tv) timekeeping_warp_clock(); } } if (tv) return do_settimeofday64(tv); return 0; } SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { struct timespec64 ts; ktime_get_real_ts64(&ts); if (put_user(ts.tv_sec, &tv->tv_sec) || put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; struct timezone new_tz; if (tv) { if (get_user(new_ts.tv_sec, &tv->tv_sec) || get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } #endif #ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret; } #endif #ifdef CONFIG_COMPAT_32BIT_TIME int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) { struct old_timex32 tx32; memset(txc, 0, sizeof(struct __kernel_timex)); if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) return -EFAULT; txc->modes = tx32.modes; txc->offset = tx32.offset; txc->freq = tx32.freq; txc->maxerror = tx32.maxerror; txc->esterror = tx32.esterror; txc->status = tx32.status; txc->constant = tx32.constant; txc->precision = tx32.precision; txc->tolerance = tx32.tolerance; txc->time.tv_sec = tx32.time.tv_sec; txc->time.tv_usec = tx32.time.tv_usec; txc->tick = tx32.tick; txc->ppsfreq = tx32.ppsfreq; txc->jitter = tx32.jitter; txc->shift = tx32.shift; txc->stabil = tx32.stabil; txc->jitcnt = tx32.jitcnt; txc->calcnt = tx32.calcnt; txc->errcnt = tx32.errcnt; txc->stbcnt = tx32.stbcnt; return 0; } int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) { struct old_timex32 tx32; memset(&tx32, 0, sizeof(struct old_timex32)); tx32.modes = txc->modes; tx32.offset = txc->offset; tx32.freq = txc->freq; tx32.maxerror = txc->maxerror; tx32.esterror = txc->esterror; tx32.status = txc->status; tx32.constant = txc->constant; tx32.precision = txc->precision; tx32.tolerance = txc->tolerance; tx32.time.tv_sec = txc->time.tv_sec; tx32.time.tv_usec = txc->time.tv_usec; tx32.tick = txc->tick; tx32.ppsfreq = txc->ppsfreq; tx32.jitter = txc->jitter; tx32.shift = txc->shift; tx32.stabil = txc->stabil; tx32.jitcnt = txc->jitcnt; tx32.calcnt = txc->calcnt; tx32.errcnt = txc->errcnt; tx32.stbcnt = txc->stbcnt; tx32.tai = txc->tai; if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) return -EFAULT; return 0; } SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) { struct __kernel_timex txc; int err, ret; err = get_old_timex32(&txc, utp); if (err) return err; ret = do_adjtimex(&txc); err = put_old_timex32(utp, &txc); if (err) return err; return ret; } #endif /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * * Avoid unnecessary multiplications/divisions in the * two most common HZ cases. * * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> HZ_TO_MSEC_SHR32; # else return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } EXPORT_SYMBOL(jiffies_to_msecs); /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value * * Return: microseconds value */ unsigned int jiffies_to_usecs(const unsigned long j) { /* * Hz usually doesn't go much further MSEC_PER_SEC. * jiffies_to_usecs() and usecs_to_jiffies() depend on that. */ BUILD_BUG_ON(HZ > USEC_PER_SEC); #if !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; #else # if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; # else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; # endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); /** * mktime64 - Converts date to seconds. * @year0: year to convert * @mon0: month to convert * @day: day to convert * @hour: hour to convert * @min: minute to convert * @sec: second to convert * * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * * [For the Julian calendar (which was used in Russia before 1917, * Britain & colonies before 1752, anywhere else before 1582, * and is still in use by some communities) leave out the * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). * * A leap second can be indicated by calling this function with sec as * 60 (allowable under ISO 8601). The leap second is treated the same * as the following second since they don't exist in UNIX time. * * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight * tomorrow - (allowable under ISO 8601) is supported. * * Return: seconds since the epoch time for the given input date */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; /* 1..12 -> 11,12,1..10 */ if (0 >= (int) (mon -= 2)) { mon += 12; /* Puts Feb last since it has leap day */ year -= 1; } return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } EXPORT_SYMBOL(mktime64); struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; tv.tv_sec = ts.tv_sec; tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; return tv; } EXPORT_SYMBOL(ns_to_kernel_old_timeval); /** * set_normalized_timespec64 - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set * @nsec: nanoseconds to set * * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC. * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { /* * The following asm() prevents the compiler from * optimising this loop into a modulo operation. See * also __iter_div_u64_rem() in include/linux/time.h */ asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } ts->tv_sec = sec; ts->tv_nsec = nsec; } EXPORT_SYMBOL(set_normalized_timespec64); /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Return: the timespec64 representation of the nsec parameter. */ struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; if (likely(nsec > 0)) { ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); ts.tv_nsec = rem; } else if (nsec < 0) { /* * With negative times, tv_sec points to the earlier * second, and tv_nsec counts the nanoseconds since * then, so tv_nsec is always a positive number. */ ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; ts.tv_nsec = NSEC_PER_SEC - rem - 1; } return ts; } EXPORT_SYMBOL(ns_to_timespec64); /** * __msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h * * Return: jiffies value */ unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } EXPORT_SYMBOL(__msecs_to_jiffies); /** * __usecs_to_jiffies: - convert microseconds to jiffies * @u: time in milliseconds * * Return: jiffies value */ unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } EXPORT_SYMBOL(__usecs_to_jiffies); /** * timespec64_to_jiffies - convert a timespec64 value to jiffies * @value: pointer to &struct timespec64 * * The TICK_NSEC - 1 rounds up the value to the next resolution. Note * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundaries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. * Note that due to the small error in the multiplier here, this * rounding is incorrect for sufficiently large values of tv_nsec, but * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. * * Return: jiffies value */ unsigned long timespec64_to_jiffies(const struct timespec64 *value) { u64 sec = value->tv_sec; long nsec = value->tv_nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; nsec = 0; } return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } EXPORT_SYMBOL(timespec64_to_jiffies); /** * jiffies_to_timespec64 - convert jiffies value to &struct timespec64 * @jiffies: jiffies value * @value: pointer to &struct timespec64 */ void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. */ u32 rem; value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, NSEC_PER_SEC, &rem); value->tv_nsec = rem; } EXPORT_SYMBOL(jiffies_to_timespec64); /* * Convert jiffies/jiffies_64 to clock_t and back. */ /** * jiffies_to_clock_t - Convert jiffies to clock_t * @x: jiffies value * * Return: jiffies converted to clock_t (CLOCKS_PER_SEC) */ clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ return x * (USER_HZ / HZ); # else return x / (HZ / USER_HZ); # endif #else return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); #endif } EXPORT_SYMBOL(jiffies_to_clock_t); /** * clock_t_to_jiffies - Convert clock_t to jiffies * @x: clock_t value * * Return: clock_t value converted to jiffies */ unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else /* Don't worry about loss of precision here .. */ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; /* .. but do try to contain it here */ return div_u64((u64)x * HZ, USER_HZ); #endif } EXPORT_SYMBOL(clock_t_to_jiffies); /** * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t * @x: jiffies_64 value * * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ x = div_u64(x * USER_HZ, HZ); # elif HZ > USER_HZ x = div_u64(x, HZ / USER_HZ); # else /* Nothing to do */ # endif #else /* * There are better ways that don't overflow early, * but even this doesn't overflow in hundreds of years * in 64 bits, so.. */ x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); #endif return x; } EXPORT_SYMBOL(jiffies_64_to_clock_t); /** * nsec_to_clock_t - Convert nsec value to clock_t * @x: nsec value * * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); #else /* * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, * overflow after 64.99 years. * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... */ return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); #endif } /** * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds * @j: jiffies64 value * * Return: nanoseconds value */ u64 jiffies64_to_nsecs(u64 j) { #if !(NSEC_PER_SEC % HZ) return (NSEC_PER_SEC / HZ) * j; # else return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_nsecs); /** * jiffies64_to_msecs - Convert jiffies64 to milliseconds * @j: jiffies64 value * * Return: milliseconds value */ u64 jiffies64_to_msecs(const u64 j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; #else return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); #endif } EXPORT_SYMBOL(jiffies64_to_msecs); /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies64 value */ u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ return div_u64(n, NSEC_PER_SEC / HZ); #elif (HZ % 512) == 0 /* overflow after 292 years if HZ = 1024 */ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); #else /* * Generic case - optimized for cases where HZ is a multiple of 3. * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. */ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years * * Return: nsecs converted to jiffies value */ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /** * timespec64_add_safe - Add two timespec64 values and do a safety check * for overflow. * @lhs: first (left) timespec64 to add * @rhs: second (right) timespec64 to add * * It's assumed that both values are valid (>= 0). * And, each timespec64 is in normalized form. * * Return: sum of @lhs + @rhs */ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs) { struct timespec64 res; set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { res.tv_sec = TIME64_MAX; res.tv_nsec = 0; } return res; } /** * get_timespec64 - get user's time value into kernel space * @ts: destination &struct timespec64 * @uts: user's time value as &struct __kernel_timespec * * Handles compat or 32-bit modes. * * Return: 0 on success or negative errno on error */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) { struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); if (ret) return -EFAULT; ts->tv_sec = kts.tv_sec; /* Zero out the padding in compat mode */ if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; /* In 32-bit mode, this drops the padding */ ts->tv_nsec = kts.tv_nsec; return 0; } EXPORT_SYMBOL_GPL(get_timespec64); /** * put_timespec64 - convert timespec64 value to __kernel_timespec format and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct __kernel_timespec * * Return: 0 on success or negative errno on error */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) { struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); static int __get_old_timespec32(struct timespec64 *ts64, const struct old_timespec32 __user *cts) { struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); if (ret) return -EFAULT; ts64->tv_sec = ts.tv_sec; ts64->tv_nsec = ts.tv_nsec; return 0; } static int __put_old_timespec32(const struct timespec64 *ts64, struct old_timespec32 __user *cts) { struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } /** * get_old_timespec32 - get user's old-format time value into kernel space * @ts: destination &struct timespec64 * @uts: user's old-format time value (&struct old_timespec32) * * Handles X86_X32_ABI compatibility conversion. * * Return: 0 on success or negative errno on error */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __get_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(get_old_timespec32); /** * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and * copy the latter to userspace * @ts: input &struct timespec64 * @uts: user's &struct old_timespec32 * * Handles X86_X32_ABI compatibility conversion. * * Return: 0 on success or negative errno on error */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else return __put_old_timespec32(ts, uts); } EXPORT_SYMBOL_GPL(put_old_timespec32); /** * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space * @it: destination &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: 0 on success or negative errno on error */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) { int ret; ret = get_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = get_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(get_itimerspec64); /** * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format * and copy the latter to userspace * @it: input &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * * Return: 0 on success or negative errno on error */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) { int ret; ret = put_timespec64(&it->it_interval, &uit->it_interval); if (ret) return ret; ret = put_timespec64(&it->it_value, &uit->it_value); return ret; } EXPORT_SYMBOL_GPL(put_itimerspec64); /** * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space * @its: destination &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: 0 on success or negative errno on error */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) { if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(get_old_itimerspec32); /** * put_old_itimerspec32 - convert &struct itimerspec64 to &struct * old_itimerspec32 and copy the latter to userspace * @its: input &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * * Return: 0 on success or negative errno on error */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) { if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(put_old_itimerspec32);
88 89 89 1 1 88 2 139 140 85 86 2 12 85 85 85 85 85 85 71 18 85 85 84 85 78 69 4 85 83 78 88 74 20 20 88 88 88 2 87 87 88 88 88 88 88 12 85 88 85 20 74 73 88 88 88 87 86 3 3 90 89 90 90 107 105 107 107 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_dir2.h" #include "xfs_health.h" #include "xfs_metafile.h" #include <linux/iversion.h> /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence * has not had the inode cores stamped into it. Hence for readahead, the buffer * may be potentially invalid. * * If the readahead buffer is invalid, we need to mark it with an error and * clear the DONE status of the buffer so that a followup read will re-read it * from disk. We don't report the error otherwise to avoid warnings during log * recovery and we don't get unnecessary panics on debug kernels. We use EIO here * because all we want to do is say readahead failed; there is no-one to report * the error to, so this will distinguish it from a non-ra verifier failure. * Changes to this readahead error behaviour also need to be reflected in * xfs_dquot_buf_readahead_verify(). */ static void xfs_inode_buf_verify( struct xfs_buf *bp, bool readahead) { struct xfs_mount *mp = bp->b_mount; int i; int ni; /* * Validate the magic number and version of every inode in the buffer */ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { struct xfs_dinode *dip; xfs_agino_t unlinked_ino; int di_ok; dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); return; } #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", (unsigned long long)xfs_buf_daddr(bp), i, be16_to_cpu(dip->di_magic)); #endif xfs_buf_verifier_error(bp, -EFSCORRUPTED, __func__, dip, sizeof(*dip), NULL); return; } } } static void xfs_inode_buf_read_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp, false); } static void xfs_inode_buf_readahead_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp, true); } static void xfs_inode_buf_write_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp, false); } const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { .name = "xfs_inode_ra", .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the * on-disk inode in the bpp parameter. */ int xfs_imap_to_bp( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, struct xfs_buf **bpp) { int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, imap->im_len, 0, bpp, &xfs_inode_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), XFS_SICK_AG_INODES); return error; } static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) { struct timespec64 tv; uint32_t n; tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n)); tv.tv_nsec = n; return tv; } /* Convert an ondisk timestamp to an incore timestamp. */ struct timespec64 xfs_inode_from_disk_ts( struct xfs_dinode *dip, const xfs_timestamp_t ts) { struct timespec64 tv; struct xfs_legacy_timestamp *lts; if (xfs_dinode_has_bigtime(dip)) return xfs_inode_decode_bigtime(be64_to_cpu(ts)); lts = (struct xfs_legacy_timestamp *)&ts; tv.tv_sec = (int)be32_to_cpu(lts->t_sec); tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec); return tv; } int xfs_inode_from_disk( struct xfs_inode *ip, struct xfs_dinode *from) { struct inode *inode = VFS_I(ip); int error; xfs_failaddr_t fa; ASSERT(ip->i_cowfp == NULL); fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from, sizeof(*from), fa); return -EFSCORRUPTED; } /* * First get the permanent information that is needed to allocate an * inode. If the inode is unused, mode is zero and we shouldn't mess * with the uninitialized part of it. */ if (!xfs_has_v3inodes(ip->i_mount)) ip->i_flushiter = be16_to_cpu(from->di_flushiter); inode->i_generation = be32_to_cpu(from->di_gen); inode->i_mode = be16_to_cpu(from->di_mode); if (!inode->i_mode) return 0; /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. * They will also be unconditionally written back to disk as v2 inodes. */ if (unlikely(from->di_version == 1)) { /* di_metatype used to be di_onlink */ set_nlink(inode, be16_to_cpu(from->di_metatype)); ip->i_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); if (xfs_dinode_is_metadir(from)) ip->i_metatype = be16_to_cpu(from->di_metatype); } i_uid_write(inode, be32_to_cpu(from->di_uid)); i_gid_write(inode, be32_to_cpu(from->di_gid)); /* * Time is signed, so need to convert to signed 32 bit before * storing in inode timestamp which may be 64 bit. Otherwise * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ inode_set_atime_to_ts(inode, xfs_inode_from_disk_ts(from, from->di_atime)); inode_set_mtime_to_ts(inode, xfs_inode_from_disk_ts(from, from->di_mtime)); inode_set_ctime_to_ts(inode, xfs_inode_from_disk_ts(from, from->di_ctime)); ip->i_disk_size = be64_to_cpu(from->di_size); ip->i_nblocks = be64_to_cpu(from->di_nblocks); ip->i_extsize = be32_to_cpu(from->di_extsize); ip->i_forkoff = from->di_forkoff; ip->i_diflags = be16_to_cpu(from->di_flags); ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked); if (from->di_dmevmask || from->di_dmstate) xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); if (xfs_has_v3inodes(ip->i_mount)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); BUILD_BUG_ON(sizeof(from->di_cowextsize) != sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); if (error) return error; if (from->di_forkoff) { error = xfs_iformat_attr_fork(ip, from); if (error) goto out_destroy_data_fork; } if (xfs_is_reflink_inode(ip)) xfs_ifork_init_cow(ip); return 0; out_destroy_data_fork: xfs_idestroy_fork(&ip->i_df); return error; } /* Convert an incore timestamp to an ondisk timestamp. */ static inline xfs_timestamp_t xfs_inode_to_disk_ts( struct xfs_inode *ip, const struct timespec64 tv) { struct xfs_legacy_timestamp *lts; xfs_timestamp_t ts; if (xfs_inode_has_bigtime(ip)) return cpu_to_be64(xfs_inode_encode_bigtime(tv)); lts = (struct xfs_legacy_timestamp *)&ts; lts->t_sec = cpu_to_be32(tv.tv_sec); lts->t_nsec = cpu_to_be32(tv.tv_nsec); return ts; } static inline void xfs_inode_to_disk_iext_counters( struct xfs_inode *ip, struct xfs_dinode *to) { if (xfs_inode_has_large_extent_counts(ip)) { to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af)); /* * We might be upgrading the inode to use larger extent counters * than was previously used. Hence zero the unused field. */ to->di_nrext64_pad = cpu_to_be16(0); } else { to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af)); } } void xfs_inode_to_disk( struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn) { struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); if (xfs_is_metadir_inode(ip)) to->di_metatype = cpu_to_be16(ip->i_metatype); else to->di_metatype = 0; to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); to->di_gid = cpu_to_be32(i_gid_read(inode)); to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode)); to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode)); to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); to->di_size = cpu_to_be64(ip->i_disk_size); to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(&ip->i_af); to->di_flags = cpu_to_be16(ip->i_diflags); if (xfs_has_v3inodes(ip->i_mount)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = cpu_to_be16(ip->i_flushiter); memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } xfs_inode_to_disk_iext_counters(ip, to); } static xfs_failaddr_t xfs_dinode_verify_fork( struct xfs_dinode *dip, struct xfs_mount *mp, int whichfork) { xfs_extnum_t di_nextents; xfs_extnum_t max_extents; mode_t mode = be16_to_cpu(dip->di_mode); uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); di_nextents = xfs_dfork_nextents(dip, whichfork); /* * For fork types that can contain local data, check that the fork * format matches the size of local data contained within the fork. */ if (whichfork == XFS_DATA_FORK) { /* * A directory small enough to fit in the inode must be stored * in local format. The directory sf <-> extents conversion * code updates the directory size accordingly. Directories * being truncated have zero size and are not subject to this * check. */ if (S_ISDIR(mode)) { if (dip->di_size && be64_to_cpu(dip->di_size) <= fork_size && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } /* * A symlink with a target small enough to fit in the inode can * be stored in extents format if xattrs were added (thus * converting the data fork from shortform to remote format) * and then removed. */ if (S_ISLNK(mode)) { if (be64_to_cpu(dip->di_size) <= fork_size && fork_format != XFS_DINODE_FMT_EXTENTS && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } /* * For all types, check that when the size says the fork should * be in extent or btree format, the inode isn't claiming to be * in local format. */ if (be64_to_cpu(dip->di_size) > fork_size && fork_format == XFS_DINODE_FMT_LOCAL) return __this_address; } switch (fork_format) { case XFS_DINODE_FMT_LOCAL: /* * No local regular files yet. */ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) return __this_address; if (di_nextents) return __this_address; break; case XFS_DINODE_FMT_EXTENTS: if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork)) return __this_address; break; case XFS_DINODE_FMT_BTREE: max_extents = xfs_iext_max_nextents( xfs_dinode_has_large_extent_counts(dip), whichfork); if (di_nextents > max_extents) return __this_address; break; case XFS_DINODE_FMT_META_BTREE: if (!xfs_has_metadir(mp)) return __this_address; if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) return __this_address; switch (be16_to_cpu(dip->di_metatype)) { case XFS_METAFILE_RTRMAP: /* * growfs must create the rtrmap inodes before adding a * realtime volume to the filesystem, so we cannot use * the rtrmapbt predicate here. */ if (!xfs_has_rmapbt(mp)) return __this_address; break; case XFS_METAFILE_RTREFCOUNT: /* same comment about growfs and rmap inodes applies */ if (!xfs_has_reflink(mp)) return __this_address; break; default: return __this_address; } break; default: return __this_address; } return NULL; } static xfs_failaddr_t xfs_dinode_verify_forkoff( struct xfs_dinode *dip, struct xfs_mount *mp) { if (!dip->di_forkoff) return NULL; switch (dip->di_format) { case XFS_DINODE_FMT_DEV: if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) return __this_address; break; case XFS_DINODE_FMT_META_BTREE: if (!xfs_has_metadir(mp) || !xfs_has_parent(mp)) return __this_address; fallthrough; case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) return __this_address; break; default: return __this_address; } return NULL; } static xfs_failaddr_t xfs_dinode_verify_nrext64( struct xfs_mount *mp, struct xfs_dinode *dip) { if (xfs_dinode_has_large_extent_counts(dip)) { if (!xfs_has_large_extent_counts(mp)) return __this_address; if (dip->di_nrext64_pad != 0) return __this_address; } else if (dip->di_version >= 3) { if (dip->di_v3_pad != 0) return __this_address; } return NULL; } /* * Validate all the picky requirements we have for a file that claims to be * filesystem metadata. */ xfs_failaddr_t xfs_dinode_verify_metadir( struct xfs_mount *mp, struct xfs_dinode *dip, uint16_t mode, uint16_t flags, uint64_t flags2) { if (!xfs_has_metadir(mp)) return __this_address; /* V5 filesystem only */ if (dip->di_version < 3) return __this_address; if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) return __this_address; /* V3 inode fields that are always zero */ if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad) return __this_address; if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter) return __this_address; /* Metadata files can only be directories or regular files */ if (!S_ISDIR(mode) && !S_ISREG(mode)) return __this_address; /* They must have zero access permissions */ if (mode & 0777) return __this_address; /* DMAPI event and state masks are zero */ if (dip->di_dmevmask || dip->di_dmstate) return __this_address; /* * User and group IDs must be zero. The project ID is used for * grouping inodes. Metadata inodes are never accounted to quotas. */ if (dip->di_uid || dip->di_gid) return __this_address; /* Mandatory inode flags must be set */ if (S_ISDIR(mode)) { if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS) return __this_address; } else { if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS) return __this_address; } /* dax flags2 must not be set */ if (flags2 & XFS_DIFLAG2_DAX) return __this_address; return NULL; } xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip) { xfs_failaddr_t fa; uint16_t mode; uint16_t flags; uint64_t flags2; uint64_t di_size; xfs_extnum_t nextents; xfs_extnum_t naextents; xfs_filblks_t nblocks; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return __this_address; /* Verify v3 integrity information first */ if (dip->di_version >= 3) { if (!xfs_has_v3inodes(mp)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) return __this_address; if (be64_to_cpu(dip->di_ino) != ino) return __this_address; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; } /* * Historical note: xfsprogs in the 3.2 era set up its incore inodes to * have di_nlink track the link count, even if the actual filesystem * only supported V1 inodes (i.e. di_onlink). When writing out the * ondisk inode, it would set both the ondisk di_nlink and di_onlink to * the the incore di_nlink value, which is why we cannot check for * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with * di_onlink==0, so we can check that. */ if (dip->di_version == 2) { if (dip->di_metatype) return __this_address; } else if (dip->di_version >= 3) { if (!xfs_dinode_is_metadir(dip) && dip->di_metatype) return __this_address; } /* don't allow invalid i_size */ di_size = be64_to_cpu(dip->di_size); if (di_size & (1ULL << 63)) return __this_address; mode = be16_to_cpu(dip->di_mode); if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) return __this_address; /* * No zero-length symlinks/dirs unless they're unlinked and hence being * inactivated. */ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { if (dip->di_version > 1) { if (dip->di_nlink) return __this_address; } else { /* di_metatype used to be di_onlink */ if (dip->di_metatype) return __this_address; } } fa = xfs_dinode_verify_nrext64(mp, dip); if (fa) return fa; nextents = xfs_dfork_data_extents(dip); naextents = xfs_dfork_attr_extents(dip); nblocks = be64_to_cpu(dip->di_nblocks); /* Fork checks carried over from xfs_iformat_fork */ if (mode && nextents + naextents > nblocks) return __this_address; if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) return __this_address; flags = be16_to_cpu(dip->di_flags); if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) return __this_address; /* check for illegal values of forkoff */ fa = xfs_dinode_verify_forkoff(dip, mp); if (fa) return fa; /* Do we have appropriate data fork formats for the mode? */ switch (mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: case S_IFSOCK: if (dip->di_format != XFS_DINODE_FMT_DEV) return __this_address; break; case S_IFREG: case S_IFLNK: case S_IFDIR: fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK); if (fa) return fa; break; case 0: /* Uninitialized inode ok. */ break; default: return __this_address; } if (dip->di_forkoff) { fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); if (fa) return fa; } else { /* * If there is no fork offset, this may be a freshly-made inode * in a new disk cluster, in which case di_aformat is zeroed. * Otherwise, such an inode must be in EXTENTS format; this goes * for freed inodes as well. */ switch (dip->di_aformat) { case 0: case XFS_DINODE_FMT_EXTENTS: break; default: return __this_address; } if (naextents) return __this_address; } /* extent size hint validation */ fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), mode, flags); if (fa) return fa; /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return NULL; flags2 = be64_to_cpu(dip->di_flags2); /* don't allow reflink/cowextsize if we don't have reflink */ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && !xfs_has_reflink(mp)) return __this_address; /* only regular files get reflink */ if ((flags2 & XFS_DIFLAG2_REFLINK) && (mode & S_IFMT) != S_IFREG) return __this_address; /* don't let reflink and realtime mix */ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && !xfs_has_rtreflink(mp)) return __this_address; if (xfs_has_zoned(mp) && dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) return __this_address; } else { /* COW extent size hint validation */ fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), mode, flags, flags2); if (fa) return fa; } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) return __this_address; if (flags2 & XFS_DIFLAG2_METADATA) { fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2); if (fa) return fa; } /* metadata inodes containing btrees always have zero extent count */ if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) { if (nextents + naextents == 0 && nblocks != 0) return __this_address; } return NULL; } void xfs_dinode_calc_crc( struct xfs_mount *mp, struct xfs_dinode *dip) { uint32_t crc; if (dip->di_version < 3) return; ASSERT(xfs_has_crc(mp)); crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } /* * Validate di_extsize hint. * * 1. Extent size hint is only valid for directories and regular files. * 2. FS_XFLAG_EXTSIZE is only valid for regular files. * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. Hint cannot be larger than MAXTEXTLEN. * 5. Can be changed on directories at any time. * 6. Hint value of 0 turns off hints, clears inode flags. * 7. Extent size must be a multiple of the appropriate block size. * For realtime files, this is the rt extent size. * 8. For non-realtime files, the extent size hint must be limited * to half the AG size to avoid alignment extending the extent beyond the * limits of the AG. */ xfs_failaddr_t xfs_inode_validate_extsize( struct xfs_mount *mp, uint32_t extsize, uint16_t mode, uint16_t flags) { bool rt_flag; bool hint_flag; bool inherit_flag; uint32_t extsize_bytes; uint32_t blocksize_bytes; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags & XFS_DIFLAG_EXTSIZE); inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); extsize_bytes = XFS_FSB_TO_B(mp, extsize); /* * This comment describes a historic gap in this verifier function. * * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this * function has never checked that the extent size hint is an integer * multiple of the realtime extent size. Since we allow users to set * this combination on non-rt filesystems /and/ to change the rt * extent size when adding a rt device to a filesystem, the net effect * is that users can configure a filesystem anticipating one rt * geometry and change their minds later. Directories do not use the * extent size hint, so this is harmless for them. * * If a directory with a misaligned extent size hint is allowed to * propagate that hint into a new regular realtime file, the result * is that the inode cluster buffer verifier will trigger a corruption * shutdown the next time it is run, because the verifier has always * enforced the alignment rule for regular files. * * Because we allow administrators to set a new rt extent size when * adding a rt section, we cannot add a check to this verifier because * that will result a new source of directory corruption errors when * reading an existing filesystem. Instead, we rely on callers to * decide when alignment checks are appropriate, and fix things up as * needed. */ if (rt_flag) blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); else blocksize_bytes = mp->m_sb.sb_blocksize; if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) return __this_address; if (hint_flag && !S_ISREG(mode)) return __this_address; if (inherit_flag && !S_ISDIR(mode)) return __this_address; if ((hint_flag || inherit_flag) && extsize == 0) return __this_address; /* free inodes get flags set to zero but extsize remains */ if (mode && !(hint_flag || inherit_flag) && extsize != 0) return __this_address; if (extsize_bytes % blocksize_bytes) return __this_address; if (extsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) return __this_address; return NULL; } /* * Validate di_cowextsize hint. * * 1. CoW extent size hint can only be set if reflink is enabled on the fs. * The inode does not have to have any shared blocks, but it must be a v3. * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; * for a directory, the hint is propagated to new files. * 3. Can be changed on files & directories at any time. * 4. Hint value of 0 turns off hints, clears inode flags. * 5. Extent size must be a multiple of the appropriate block size. * 6. The extent size hint must be limited to half the AG size to avoid * alignment extending the extent beyond the limits of the AG. */ xfs_failaddr_t xfs_inode_validate_cowextsize( struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2) { bool rt_flag; bool hint_flag; uint32_t cowextsize_bytes; uint32_t blocksize_bytes; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); /* * Similar to extent size hints, a directory can be configured to * propagate realtime status and a CoW extent size hint to newly * created files even if there is no realtime device, and the hints on * disk can become misaligned if the sysadmin changes the rt extent * size while adding the realtime device. * * Therefore, we can only enforce the rextsize alignment check against * regular realtime files, and rely on callers to decide when alignment * checks are appropriate, and fix things up as needed. */ if (rt_flag) blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); else blocksize_bytes = mp->m_sb.sb_blocksize; if (hint_flag && !xfs_has_reflink(mp)) return __this_address; if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) return __this_address; if (hint_flag && cowextsize == 0) return __this_address; /* free inodes get flags set to zero but cowextsize remains */ if (mode && !hint_flag && cowextsize != 0) return __this_address; if (cowextsize_bytes % blocksize_bytes) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) return __this_address; return NULL; }
1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 // SPDX-License-Identifier: GPL-2.0-only /* * Frontend driver for the GENPIX 8pks/qpsk/DCII USB2.0 DVB-S module * * Copyright (C) 2006,2007 Alan Nisota (alannisota@gmail.com) * Copyright (C) 2006,2007 Genpix Electronics (genpix@genpix-electronics.com) * * Thanks to GENPIX for the sample code used to implement this module. * * This module is based off the vp7045 and vp702x modules */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "gp8psk-fe.h" #include <media/dvb_frontend.h> static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off)."); #define dprintk(fmt, arg...) do { \ if (debug) \ printk(KERN_DEBUG pr_fmt("%s: " fmt), \ __func__, ##arg); \ } while (0) struct gp8psk_fe_state { struct dvb_frontend fe; void *priv; const struct gp8psk_fe_ops *ops; bool is_rev1; u8 lock; u16 snr; unsigned long next_status_check; unsigned long status_check_interval; }; static int gp8psk_tuned_to_DCII(struct dvb_frontend *fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 status; st->ops->in(st->priv, GET_8PSK_CONFIG, 0, 0, &status, 1); return status & bmDCtuned; } static int gp8psk_set_tuner_mode(struct dvb_frontend *fe, int mode) { struct gp8psk_fe_state *st = fe->demodulator_priv; return st->ops->out(st->priv, SET_8PSK_CONFIG, mode, 0, NULL, 0); } static int gp8psk_fe_update_status(struct gp8psk_fe_state *st) { u8 buf[6]; if (time_after(jiffies,st->next_status_check)) { st->ops->in(st->priv, GET_SIGNAL_LOCK, 0, 0, &st->lock, 1); st->ops->in(st->priv, GET_SIGNAL_STRENGTH, 0, 0, buf, 6); st->snr = (buf[1]) << 8 | buf[0]; st->next_status_check = jiffies + (st->status_check_interval*HZ)/1000; } return 0; } static int gp8psk_fe_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); if (st->lock) *status = FE_HAS_LOCK | FE_HAS_SYNC | FE_HAS_VITERBI | FE_HAS_SIGNAL | FE_HAS_CARRIER; else *status = 0; if (*status & FE_HAS_LOCK) st->status_check_interval = 1000; else st->status_check_interval = 100; return 0; } /* not supported by this Frontend */ static int gp8psk_fe_read_ber(struct dvb_frontend* fe, u32 *ber) { (void) fe; *ber = 0; return 0; } /* not supported by this Frontend */ static int gp8psk_fe_read_unc_blocks(struct dvb_frontend* fe, u32 *unc) { (void) fe; *unc = 0; return 0; } static int gp8psk_fe_read_snr(struct dvb_frontend* fe, u16 *snr) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); /* snr is reported in dBu*256 */ *snr = st->snr; return 0; } static int gp8psk_fe_read_signal_strength(struct dvb_frontend* fe, u16 *strength) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); /* snr is reported in dBu*256 */ /* snr / 38.4 ~= 100% strength */ /* snr * 17 returns 100% strength as 65535 */ if (st->snr > 0xf00) *strength = 0xffff; else *strength = (st->snr << 4) + st->snr; /* snr*17 */ return 0; } static int gp8psk_fe_get_tune_settings(struct dvb_frontend* fe, struct dvb_frontend_tune_settings *tune) { tune->min_delay_ms = 800; return 0; } static int gp8psk_fe_set_frontend(struct dvb_frontend *fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; struct dtv_frontend_properties *c = &fe->dtv_property_cache; u8 cmd[10]; u32 freq = c->frequency * 1000; dprintk("%s()\n", __func__); cmd[4] = freq & 0xff; cmd[5] = (freq >> 8) & 0xff; cmd[6] = (freq >> 16) & 0xff; cmd[7] = (freq >> 24) & 0xff; /* backwards compatibility: DVB-S + 8-PSK were used for Turbo-FEC */ if (c->delivery_system == SYS_DVBS && c->modulation == PSK_8) c->delivery_system = SYS_TURBO; switch (c->delivery_system) { case SYS_DVBS: if (c->modulation != QPSK) { dprintk("%s: unsupported modulation selected (%d)\n", __func__, c->modulation); return -EOPNOTSUPP; } c->fec_inner = FEC_AUTO; break; case SYS_DVBS2: /* kept for backwards compatibility */ dprintk("%s: DVB-S2 delivery system selected\n", __func__); break; case SYS_TURBO: dprintk("%s: Turbo-FEC delivery system selected\n", __func__); break; default: dprintk("%s: unsupported delivery system selected (%d)\n", __func__, c->delivery_system); return -EOPNOTSUPP; } cmd[0] = c->symbol_rate & 0xff; cmd[1] = (c->symbol_rate >> 8) & 0xff; cmd[2] = (c->symbol_rate >> 16) & 0xff; cmd[3] = (c->symbol_rate >> 24) & 0xff; switch (c->modulation) { case QPSK: if (st->is_rev1) if (gp8psk_tuned_to_DCII(fe)) st->ops->reload(st->priv); switch (c->fec_inner) { case FEC_1_2: cmd[9] = 0; break; case FEC_2_3: cmd[9] = 1; break; case FEC_3_4: cmd[9] = 2; break; case FEC_5_6: cmd[9] = 3; break; case FEC_7_8: cmd[9] = 4; break; case FEC_AUTO: cmd[9] = 5; break; default: cmd[9] = 5; break; } if (c->delivery_system == SYS_TURBO) cmd[8] = ADV_MOD_TURBO_QPSK; else cmd[8] = ADV_MOD_DVB_QPSK; break; case PSK_8: /* PSK_8 is for compatibility with DN */ cmd[8] = ADV_MOD_TURBO_8PSK; switch (c->fec_inner) { case FEC_2_3: cmd[9] = 0; break; case FEC_3_4: cmd[9] = 1; break; case FEC_3_5: cmd[9] = 2; break; case FEC_5_6: cmd[9] = 3; break; case FEC_8_9: cmd[9] = 4; break; default: cmd[9] = 0; break; } break; case QAM_16: /* QAM_16 is for compatibility with DN */ cmd[8] = ADV_MOD_TURBO_16QAM; cmd[9] = 0; break; default: /* Unknown modulation */ dprintk("%s: unsupported modulation selected (%d)\n", __func__, c->modulation); return -EOPNOTSUPP; } if (st->is_rev1) gp8psk_set_tuner_mode(fe, 0); st->ops->out(st->priv, TUNE_8PSK, 0, 0, cmd, 10); st->lock = 0; st->next_status_check = jiffies; st->status_check_interval = 200; return 0; } static int gp8psk_fe_send_diseqc_msg (struct dvb_frontend* fe, struct dvb_diseqc_master_cmd *m) { struct gp8psk_fe_state *st = fe->demodulator_priv; dprintk("%s\n", __func__); if (st->ops->out(st->priv, SEND_DISEQC_COMMAND, m->msg[0], 0, m->msg, m->msg_len)) { return -EINVAL; } return 0; } static int gp8psk_fe_send_diseqc_burst(struct dvb_frontend *fe, enum fe_sec_mini_cmd burst) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 cmd; dprintk("%s\n", __func__); /* These commands are certainly wrong */ cmd = (burst == SEC_MINI_A) ? 0x00 : 0x01; if (st->ops->out(st->priv, SEND_DISEQC_COMMAND, cmd, 0, &cmd, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_set_tone(struct dvb_frontend *fe, enum fe_sec_tone_mode tone) { struct gp8psk_fe_state *st = fe->demodulator_priv; if (st->ops->out(st->priv, SET_22KHZ_TONE, (tone == SEC_TONE_ON), 0, NULL, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { struct gp8psk_fe_state *st = fe->demodulator_priv; if (st->ops->out(st->priv, SET_LNB_VOLTAGE, voltage == SEC_VOLTAGE_18, 0, NULL, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_enable_high_lnb_voltage(struct dvb_frontend* fe, long onoff) { struct gp8psk_fe_state *st = fe->demodulator_priv; return st->ops->out(st->priv, USE_EXTRA_VOLT, onoff, 0, NULL, 0); } static int gp8psk_fe_send_legacy_dish_cmd (struct dvb_frontend* fe, unsigned long sw_cmd) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 cmd = sw_cmd & 0x7f; if (st->ops->out(st->priv, SET_DN_SWITCH, cmd, 0, NULL, 0)) return -EINVAL; if (st->ops->out(st->priv, SET_LNB_VOLTAGE, !!(sw_cmd & 0x80), 0, NULL, 0)) return -EINVAL; return 0; } static void gp8psk_fe_release(struct dvb_frontend* fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; kfree(st); } static const struct dvb_frontend_ops gp8psk_fe_ops; struct dvb_frontend *gp8psk_fe_attach(const struct gp8psk_fe_ops *ops, void *priv, bool is_rev1) { struct gp8psk_fe_state *st; if (!ops || !ops->in || !ops->out || !ops->reload) { pr_err("Error! gp8psk-fe ops not defined.\n"); return NULL; } st = kzalloc(sizeof(struct gp8psk_fe_state), GFP_KERNEL); if (!st) return NULL; memcpy(&st->fe.ops, &gp8psk_fe_ops, sizeof(struct dvb_frontend_ops)); st->fe.demodulator_priv = st; st->ops = ops; st->priv = priv; st->is_rev1 = is_rev1; pr_info("Frontend %sattached\n", is_rev1 ? "revision 1 " : ""); return &st->fe; } EXPORT_SYMBOL_GPL(gp8psk_fe_attach); static const struct dvb_frontend_ops gp8psk_fe_ops = { .delsys = { SYS_DVBS }, .info = { .name = "Genpix DVB-S", .frequency_min_hz = 800 * MHz, .frequency_max_hz = 2250 * MHz, .frequency_stepsize_hz = 100 * kHz, .symbol_rate_min = 1000000, .symbol_rate_max = 45000000, .symbol_rate_tolerance = 500, /* ppm */ .caps = FE_CAN_INVERSION_AUTO | FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | /* * FE_CAN_QAM_16 is for compatibility * (Myth incorrectly detects Turbo-QPSK as plain QAM-16) */ FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_TURBO_FEC }, .release = gp8psk_fe_release, .init = NULL, .sleep = NULL, .set_frontend = gp8psk_fe_set_frontend, .get_tune_settings = gp8psk_fe_get_tune_settings, .read_status = gp8psk_fe_read_status, .read_ber = gp8psk_fe_read_ber, .read_signal_strength = gp8psk_fe_read_signal_strength, .read_snr = gp8psk_fe_read_snr, .read_ucblocks = gp8psk_fe_read_unc_blocks, .diseqc_send_master_cmd = gp8psk_fe_send_diseqc_msg, .diseqc_send_burst = gp8psk_fe_send_diseqc_burst, .set_tone = gp8psk_fe_set_tone, .set_voltage = gp8psk_fe_set_voltage, .dishnetwork_send_legacy_command = gp8psk_fe_send_legacy_dish_cmd, .enable_high_lnb_voltage = gp8psk_fe_enable_high_lnb_voltage }; MODULE_AUTHOR("Alan Nisota <alannisota@gamil.com>"); MODULE_DESCRIPTION("Frontend Driver for Genpix DVB-S"); MODULE_VERSION("1.1"); MODULE_LICENSE("GPL");
114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SIX_H #define _LINUX_SIX_H /** * DOC: SIX locks overview * * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores * but with an additional state: read/shared, intent, exclusive/write * * The purpose of the intent state is to allow for greater concurrency on tree * structures without deadlocking. In general, a read can't be upgraded to a * write lock without deadlocking, so an operation that updates multiple nodes * will have to take write locks for the full duration of the operation. * * But by adding an intent state, which is exclusive with other intent locks but * not with readers, we can take intent locks at the start of the operation, * and then take write locks only for the actual update to each individual * nodes, without deadlocking. * * Example usage: * six_lock_read(&foo->lock); * six_unlock_read(&foo->lock); * * An intent lock must be held before taking a write lock: * six_lock_intent(&foo->lock); * six_lock_write(&foo->lock); * six_unlock_write(&foo->lock); * six_unlock_intent(&foo->lock); * * Other operations: * six_trylock_read() * six_trylock_intent() * six_trylock_write() * * six_lock_downgrade() convert from intent to read * six_lock_tryupgrade() attempt to convert from read to intent, may fail * * There are also interfaces that take the lock type as an enum: * * six_lock_type(&foo->lock, SIX_LOCK_read); * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) * six_lock_type(&foo->lock, SIX_LOCK_write); * six_unlock_type(&foo->lock, SIX_LOCK_write); * six_unlock_type(&foo->lock, SIX_LOCK_intent); * * Lock sequence numbers - unlock(), relock(): * * Locks embed sequences numbers, which are incremented on write lock/unlock. * This allows locks to be dropped and the retaken iff the state they protect * hasn't changed; this makes it much easier to avoid holding locks while e.g. * doing IO or allocating memory. * * Example usage: * six_lock_read(&foo->lock); * u32 seq = six_lock_seq(&foo->lock); * six_unlock_read(&foo->lock); * * some_operation_that_may_block(); * * if (six_relock_read(&foo->lock, seq)) { ... } * * If the relock operation succeeds, it is as if the lock was never unlocked. * * Reentrancy: * * Six locks are not by themselves reentrant, but have counters for both the * read and intent states that can be used to provide reentrancy by an upper * layer that tracks held locks. If a lock is known to already be held in the * read or intent state, six_lock_increment() can be used to bump the "lock * held in this state" counter, increasing the number of unlock calls that * will be required to fully unlock it. * * Example usage: * six_lock_read(&foo->lock); * six_lock_increment(&foo->lock, SIX_LOCK_read); * six_unlock_read(&foo->lock); * six_unlock_read(&foo->lock); * foo->lock is now fully unlocked. * * Since the intent state supercedes read, it's legal to increment the read * counter when holding an intent lock, but not the reverse. * * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) * is not legal. * * should_sleep_fn: * * There is a six_lock() variant that takes a function pointer that is called * immediately prior to schedule() when blocking, and may return an error to * abort. * * One possible use for this feature is when objects being locked are part of * a cache and may reused, and lock ordering is based on a property of the * object that will change when the object is reused - i.e. logical key order. * * If looking up an object in the cache may race with object reuse, and lock * ordering is required to prevent deadlock, object reuse may change the * correct lock order for that object and cause a deadlock. should_sleep_fn * can be used to check if the object is still the object we want and avoid * this deadlock. * * Wait list entry interface: * * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a * wait list entry. By embedding six_lock_waiter into another object, and by * traversing lock waitlists, it is then possible for an upper layer to * implement full cycle detection for deadlock avoidance. * * should_sleep_fn should be used for invoking the cycle detector, walking the * graph of held locks to check for a deadlock. The upper layer must track * held locks for each thread, and each thread's held locks must be reachable * from its six_lock_waiter object. * * six_lock_waiter() will add the wait object to the waitlist re-trying taking * the lock, and before calling should_sleep_fn, and the wait object will not * be removed from the waitlist until either the lock has been successfully * acquired, or we aborted because should_sleep_fn returned an error. * * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will * have timestamps in strictly ascending order - this is so the timestamp can * be used as a cursor for lock graph traverse. */ #include <linux/lockdep.h> #include <linux/sched.h> #include <linux/types.h> enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, SIX_LOCK_write, }; struct six_lock { atomic_t state; u32 seq; unsigned intent_lock_recurse; unsigned write_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; struct six_lock_waiter { struct list_head list; struct task_struct *task; enum six_lock_type lock_want; bool lock_acquired; u64 start_time; }; typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); void six_lock_exit(struct six_lock *lock); enum six_lock_init_flags { SIX_LOCK_INIT_PCPU = 1U << 0, }; void __six_lock_init(struct six_lock *lock, const char *name, struct lock_class_key *key, enum six_lock_init_flags flags, gfp_t gfp); /** * six_lock_init - initialize a six lock * @lock: lock to initialize * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU */ #define six_lock_init(lock, flags, gfp) \ do { \ static struct lock_class_key __key; \ \ __six_lock_init((lock), #lock, &__key, flags, gfp); \ } while (0) /** * six_lock_seq - obtain current lock sequence number * @lock: six_lock to obtain sequence number for * * @lock should be held for read or intent, and not write * * By saving the lock sequence number, we can unlock @lock and then (typically * after some blocking operation) attempt to relock it: the relock will succeed * if the sequence number hasn't changed, meaning no write locks have been taken * and state corresponding to what @lock protects is still valid. */ static inline u32 six_lock_seq(const struct six_lock *lock) { return lock->seq; } bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); /** * six_trylock_type - attempt to take a six lock without blocking * @lock: lock to take * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * * Return: true on success, false on failure. */ static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) { return six_trylock_ip(lock, type, _THIS_IP_); } int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip); /** * six_lock_waiter - take a lock, with full waitlist interface * @lock: lock to take * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * @wait: pointer to wait object, which will be added to lock's waitlist * @should_sleep_fn: callback run after adding to waitlist, immediately prior * to scheduling * @p: passed through to @should_sleep_fn * * This is a convenience wrapper around six_lock_ip_waiter(), see that function * for full documentation. * * Return: 0 on success, or the return code from @should_sleep_fn on failure. */ static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, six_lock_should_sleep_fn should_sleep_fn, void *p) { return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); } /** * six_lock_ip - take a six lock lock * @lock: lock to take * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * @should_sleep_fn: callback run after adding to waitlist, immediately prior * to scheduling * @p: passed through to @should_sleep_fn * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ * * Return: 0 on success, or the return code from @should_sleep_fn on failure. */ static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { struct six_lock_waiter wait; return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); } /** * six_lock_type - take a six lock lock * @lock: lock to take * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * @should_sleep_fn: callback run after adding to waitlist, immediately prior * to scheduling * @p: passed through to @should_sleep_fn * * Return: 0 on success, or the return code from @should_sleep_fn on failure. */ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) { struct six_lock_waiter wait; return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); } bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, unsigned seq, unsigned long ip); /** * six_relock_type - attempt to re-take a lock that was held previously * @lock: lock to take * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * @seq: lock sequence number obtained from six_lock_seq() while lock was * held previously * * Return: true on success, false on failure. */ static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, unsigned seq) { return six_relock_ip(lock, type, seq, _THIS_IP_); } void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); /** * six_unlock_type - drop a six lock * @lock: lock to unlock * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * * When a lock is held multiple times (because six_lock_incement()) was used), * this decrements the 'lock held' counter by one. * * For example: * six_lock_read(&foo->lock); read count 1 * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 */ static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) { six_unlock_ip(lock, type, _THIS_IP_); } #define __SIX_LOCK(type) \ static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ { \ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ } \ \ static inline bool six_trylock_##type(struct six_lock *lock) \ { \ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ } \ \ static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ struct six_lock_waiter *wait, \ six_lock_should_sleep_fn should_sleep_fn, void *p,\ unsigned long ip) \ { \ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ } \ \ static inline int six_lock_ip_##type(struct six_lock *lock, \ six_lock_should_sleep_fn should_sleep_fn, void *p, \ unsigned long ip) \ { \ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ } \ \ static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ { \ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ } \ \ static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ { \ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ } \ \ static inline int six_lock_##type(struct six_lock *lock, \ six_lock_should_sleep_fn fn, void *p)\ { \ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ } \ \ static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ { \ six_unlock_ip(lock, SIX_LOCK_##type, ip); \ } \ \ static inline void six_unlock_##type(struct six_lock *lock) \ { \ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ } __SIX_LOCK(read) __SIX_LOCK(intent) __SIX_LOCK(write) #undef __SIX_LOCK void six_lock_downgrade(struct six_lock *); bool six_lock_tryupgrade(struct six_lock *); bool six_trylock_convert(struct six_lock *, enum six_lock_type, enum six_lock_type); void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); struct six_lock_count { unsigned n[3]; }; struct six_lock_count six_lock_counts(struct six_lock *); void six_lock_readers_add(struct six_lock *, int); #endif /* _LINUX_SIX_H */
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0-only */ /* * This file is part of the Linux kernel. * * Copyright (c) 2011-2014, Intel Corporation * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. Peter Anvin <hpa@linux.intel.com> */ #ifndef ASM_X86_ARCHRANDOM_H #define ASM_X86_ARCHRANDOM_H #include <asm/processor.h> #include <asm/cpufeature.h> #define RDRAND_RETRY_LOOPS 10 /* Unconditional execution of RDRAND and RDSEED */ static inline bool __must_check rdrand_long(unsigned long *v) { bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { asm volatile("rdrand %[out]" CC_SET(c) : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); return false; } static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; asm volatile("rdseed %[out]" CC_SET(c) : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } /* * These are the generic interfaces; they must not be declared if the * stubs in <linux/random.h> are to be invoked. */ static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0; } static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0; } #ifndef CONFIG_UML void x86_init_rdrand(struct cpuinfo_x86 *c); #endif #endif /* ASM_X86_ARCHRANDOM_H */
187 146 129 314 203 219 15 37 25 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/quota.h> #include <linux/export.h> /** * qid_eq - Test to see if to kquid values are the same * @left: A qid value * @right: Another quid value * * Return true if the two qid values are equal and false otherwise. */ bool qid_eq(struct kqid left, struct kqid right) { if (left.type != right.type) return false; switch(left.type) { case USRQUOTA: return uid_eq(left.uid, right.uid); case GRPQUOTA: return gid_eq(left.gid, right.gid); case PRJQUOTA: return projid_eq(left.projid, right.projid); default: BUG(); } } EXPORT_SYMBOL(qid_eq); /** * qid_lt - Test to see if one qid value is less than another * @left: The possibly lesser qid value * @right: The possibly greater qid value * * Return true if left is less than right and false otherwise. */ bool qid_lt(struct kqid left, struct kqid right) { if (left.type < right.type) return true; if (left.type > right.type) return false; switch (left.type) { case USRQUOTA: return uid_lt(left.uid, right.uid); case GRPQUOTA: return gid_lt(left.gid, right.gid); case PRJQUOTA: return projid_lt(left.projid, right.projid); default: BUG(); } } EXPORT_SYMBOL(qid_lt); /** * from_kqid - Create a qid from a kqid user-namespace pair. * @targ: The user namespace we want a qid in. * @kqid: The kernel internal quota identifier to start with. * * Map @kqid into the user-namespace specified by @targ and * return the resulting qid. * * There is always a mapping into the initial user_namespace. * * If @kqid has no mapping in @targ (qid_t)-1 is returned. */ qid_t from_kqid(struct user_namespace *targ, struct kqid kqid) { switch (kqid.type) { case USRQUOTA: return from_kuid(targ, kqid.uid); case GRPQUOTA: return from_kgid(targ, kqid.gid); case PRJQUOTA: return from_kprojid(targ, kqid.projid); default: BUG(); } } EXPORT_SYMBOL(from_kqid); /** * from_kqid_munged - Create a qid from a kqid user-namespace pair. * @targ: The user namespace we want a qid in. * @kqid: The kernel internal quota identifier to start with. * * Map @kqid into the user-namespace specified by @targ and * return the resulting qid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kqid from_kqid_munged never fails and always * returns a valid projid. This makes from_kqid_munged * appropriate for use in places where failing to provide * a qid_t is not a good option. * * If @kqid has no mapping in @targ the kqid.type specific * overflow identifier is returned. */ qid_t from_kqid_munged(struct user_namespace *targ, struct kqid kqid) { switch (kqid.type) { case USRQUOTA: return from_kuid_munged(targ, kqid.uid); case GRPQUOTA: return from_kgid_munged(targ, kqid.gid); case PRJQUOTA: return from_kprojid_munged(targ, kqid.projid); default: BUG(); } } EXPORT_SYMBOL(from_kqid_munged); /** * qid_valid - Report if a valid value is stored in a kqid. * @qid: The kernel internal quota identifier to test. */ bool qid_valid(struct kqid qid) { switch (qid.type) { case USRQUOTA: return uid_valid(qid.uid); case GRPQUOTA: return gid_valid(qid.gid); case PRJQUOTA: return projid_valid(qid.projid); default: BUG(); } } EXPORT_SYMBOL(qid_valid);
32 32 3 8 31 31 1 2 29 19 32 12 12 13 28 28 30 13 12 1 1 33 33 31 20 17 13 19 13 18 32 10 12 31 1 31 31 32 1 31 12 10 32 32 1 13 13 1 1 1 10 29 12 29 12 37 28 19 32 10 2 44 44 44 43 26 28 14 2 15 32 25 24 32 16 32 32 3 19 38 31 21 31 31 5 31 9 31 3 8 1 9 9 13 31 7 31 31 30 7 31 29 31 29 12 3 10 31 31 31 30 31 31 16 18 1 31 31 12 31 31 19 9 31 31 31 31 1 31 32 32 31 9 32 23 12 16 9 3 30 28 13 12 11 12 3 3 28 26 18 17 17 17 17 9 15 17 17 32 1 32 9 32 30 2 32 12 24 17 32 7 15 6 32 32 1 32 43 43 14 32 18 26 44 12 18 42 30 33 3 3 37 38 37 4 33 21 21 21 21 17 14 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" #include <linux/psi.h> #include <linux/cpuhotplug.h> #include <trace/events/erofs.h> #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 struct z_erofs_bvec { struct page *page; int offset; unsigned int end; }; #define __Z_EROFS_BVSET(name, total) \ struct name { \ /* point to the next page which contains the following bvecs */ \ struct page *nextpage; \ struct z_erofs_bvec bvec[total]; \ } __Z_EROFS_BVSET(z_erofs_bvset,); __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only * for everyone else; * * L: Field should be protected by the pcluster lock; * * A: Field should be accessed / updated in atomic for parallelized code. */ struct z_erofs_pcluster { struct mutex lock; struct lockref lockref; /* A: point to next chained pcluster or TAILs */ struct z_erofs_pcluster *next; /* I: start physical position of this pcluster */ erofs_off_t pos; /* L: the maximum decompression size of this round */ unsigned int length; /* L: total number of bvecs */ unsigned int vcnt; /* I: pcluster size (compressed size) in bytes */ unsigned int pclustersize; /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; union { /* L: inline a certain number of bvec for bootstrap */ struct z_erofs_bvset_inline bvset; /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; /* I: compression algorithm format */ unsigned char algorithmformat; /* I: whether compressed data is in-lined or not */ bool from_meta; /* L: whether partial decompression or not */ bool partial; /* L: whether extra buffer allocations are best-effort */ bool besteffort; /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; /* the end of a chain of pclusters */ #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) struct z_erofs_decompressqueue { struct super_block *sb; struct z_erofs_pcluster *head; atomic_t pending_bios; union { struct completion done; struct work_struct work; struct kthread_work kthread_work; } u; bool eio, sync; }; static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT; } static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) { return fo->mapping == MNGD_MAPPING(sbi); } #define Z_EROFS_ONSTACK_PAGES 32 /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. */ struct z_erofs_pcluster_slab { struct kmem_cache *slab; unsigned int maxpages; char name[48]; }; #define _PCLP(n) { .maxpages = n } static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1) }; struct z_erofs_bvec_iter { struct page *bvpage; struct z_erofs_bvset *bvset; unsigned int nr, cur; }; static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) { if (iter->bvpage) kunmap_local(iter->bvset); return iter->bvpage; } static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) { unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; /* have to access nextpage in advance, otherwise it will be unmapped */ struct page *nextpage = iter->bvset->nextpage; struct page *oldpage; DBG_BUGON(!nextpage); oldpage = z_erofs_bvec_iter_end(iter); iter->bvpage = nextpage; iter->bvset = kmap_local_page(nextpage); iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); iter->cur = 0; return oldpage; } static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, struct z_erofs_bvset_inline *bvset, unsigned int bootstrap_nr, unsigned int cur) { *iter = (struct z_erofs_bvec_iter) { .nr = bootstrap_nr, .bvset = (struct z_erofs_bvset *)bvset, }; while (cur > iter->nr) { cur -= iter->nr; z_erofs_bvset_flip(iter); } iter->cur = cur; } static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, struct page **candidate_bvpage, struct page **pagepool) { if (iter->cur >= iter->nr) { struct page *nextpage = *candidate_bvpage; if (!nextpage) { nextpage = __erofs_allocpage(pagepool, GFP_KERNEL, true); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); } DBG_BUGON(iter->bvset->nextpage); iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; *candidate_bvpage = NULL; } iter->bvset->bvec[iter->cur++] = *bvec; return 0; } static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, struct page **old_bvpage) { if (iter->cur == iter->nr) *old_bvpage = z_erofs_bvset_flip(iter); else *old_bvpage = NULL; *bvec = iter->bvset->bvec[iter->cur++]; } static void z_erofs_destroy_pcluster_pool(void) { int i; for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { if (!pcluster_pool[i].slab) continue; kmem_cache_destroy(pcluster_pool[i].slab); pcluster_pool[i].slab = NULL; } } static int z_erofs_create_pcluster_pool(void) { struct z_erofs_pcluster_slab *pcs; struct z_erofs_pcluster *a; unsigned int size; for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); if (pcs->slab) continue; z_erofs_destroy_pcluster_pool(); return -ENOMEM; } return 0; } static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; struct z_erofs_pcluster_slab *pcs = pcluster_pool; for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) continue; pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); return pcl; } return ERR_PTR(-EINVAL); } static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) { unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; if (pclusterpages > pcs->maxpages) continue; kmem_cache_free(pcs->slab, pcl); return; } DBG_BUGON(1); } static struct workqueue_struct *z_erofs_workqueue __read_mostly; #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD static struct kthread_worker __rcu **z_erofs_pcpu_workers; static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0); static void erofs_destroy_percpu_workers(void) { struct kthread_worker *worker; unsigned int cpu; for_each_possible_cpu(cpu) { worker = rcu_dereference_protected( z_erofs_pcpu_workers[cpu], 1); rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); if (worker) kthread_destroy_worker(worker); } kfree(z_erofs_pcpu_workers); } static struct kthread_worker *erofs_init_percpu_worker(int cpu) { struct kthread_worker *worker = kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u"); if (IS_ERR(worker)) return worker; if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) sched_set_fifo_low(worker->task); return worker; } static int erofs_init_percpu_workers(void) { struct kthread_worker *worker; unsigned int cpu; z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), sizeof(struct kthread_worker *), GFP_ATOMIC); if (!z_erofs_pcpu_workers) return -ENOMEM; for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ worker = erofs_init_percpu_worker(cpu); if (!IS_ERR(worker)) rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); } return 0; } #ifdef CONFIG_HOTPLUG_CPU static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); static enum cpuhp_state erofs_cpuhp_state; static int erofs_cpu_online(unsigned int cpu) { struct kthread_worker *worker, *old; worker = erofs_init_percpu_worker(cpu); if (IS_ERR(worker)) return PTR_ERR(worker); spin_lock(&z_erofs_pcpu_worker_lock); old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], lockdep_is_held(&z_erofs_pcpu_worker_lock)); if (!old) rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); spin_unlock(&z_erofs_pcpu_worker_lock); if (old) kthread_destroy_worker(worker); return 0; } static int erofs_cpu_offline(unsigned int cpu) { struct kthread_worker *worker; spin_lock(&z_erofs_pcpu_worker_lock); worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], lockdep_is_held(&z_erofs_pcpu_worker_lock)); rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); spin_unlock(&z_erofs_pcpu_worker_lock); synchronize_rcu(); if (worker) kthread_destroy_worker(worker); return 0; } static int erofs_cpu_hotplug_init(void) { int state; state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); if (state < 0) return state; erofs_cpuhp_state = state; return 0; } static void erofs_cpu_hotplug_destroy(void) { if (erofs_cpuhp_state) cpuhp_remove_state_nocalls(erofs_cpuhp_state); } #else /* !CONFIG_HOTPLUG_CPU */ static inline int erofs_cpu_hotplug_init(void) { return 0; } static inline void erofs_cpu_hotplug_destroy(void) {} #endif/* CONFIG_HOTPLUG_CPU */ static int z_erofs_init_pcpu_workers(struct super_block *sb) { int err; if (atomic_xchg(&erofs_percpu_workers_initialized, 1)) return 0; err = erofs_init_percpu_workers(); if (err) { erofs_err(sb, "per-cpu workers: failed to allocate."); goto err_init_percpu_workers; } err = erofs_cpu_hotplug_init(); if (err < 0) { erofs_err(sb, "per-cpu workers: failed CPU hotplug init."); goto err_cpuhp_init; } erofs_info(sb, "initialized per-cpu workers successfully."); return err; err_cpuhp_init: erofs_destroy_percpu_workers(); err_init_percpu_workers: atomic_set(&erofs_percpu_workers_initialized, 0); return err; } static void z_erofs_destroy_pcpu_workers(void) { if (!atomic_xchg(&erofs_percpu_workers_initialized, 0)) return; erofs_cpu_hotplug_destroy(); erofs_destroy_percpu_workers(); } #else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */ static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; } static inline void z_erofs_destroy_pcpu_workers(void) {} #endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */ void z_erofs_exit_subsystem(void) { z_erofs_destroy_pcpu_workers(); destroy_workqueue(z_erofs_workqueue); z_erofs_destroy_pcluster_pool(); z_erofs_crypto_disable_all_engines(); z_erofs_exit_decompressor(); } int __init z_erofs_init_subsystem(void) { int err = z_erofs_init_decompressor(); if (err) goto err_decompressor; err = z_erofs_create_pcluster_pool(); if (err) goto err_pcluster_pool; z_erofs_workqueue = alloc_workqueue("erofs_worker", WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); if (!z_erofs_workqueue) { err = -ENOMEM; goto err_workqueue_init; } return err; err_workqueue_init: z_erofs_destroy_pcluster_pool(); err_pcluster_pool: z_erofs_exit_decompressor(); err_decompressor: return err; } enum z_erofs_pclustermode { /* It has previously been linked into another processing chain */ Z_EROFS_PCLUSTER_INFLIGHT, /* * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it * may be dispatched to the bypass queue later due to uptodated managed * folios. All file-backed folios related to this pcluster cannot be * reused for in-place I/O (or bvpage) since the pcluster may be decoded * in a separate queue (and thus out of order). */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The pcluster has just been linked to our processing chain. * File-backed folios (except for the head page) related to it can be * used for in-place I/O (or bvpage). */ Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; struct page *pagepool; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *head; enum z_erofs_pclustermode mode; erofs_off_t headoffset; /* a pointer used to pick up inplace I/O pages */ unsigned int icur; }; #define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \ .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho } static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe) { unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) return false; if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) return true; if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && fe->map.m_la < fe->headoffset) return true; return false; } static void z_erofs_bind_cache(struct z_erofs_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); pgoff_t poff = pcl->pos >> PAGE_SHIFT; bool may_bypass = true; /* Optimistic allocation, as in-place I/O can be used as a fallback */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; struct folio *folio, *newfolio; unsigned int i; if (i_blocksize(fe->inode) != PAGE_SIZE || fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pclusterpages; ++i) { /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; folio = filemap_get_folio(mc, poff + i); if (IS_ERR(folio)) { may_bypass = false; if (!shouldalloc) continue; /* * Allocate a managed folio for cached I/O, or it may be * then filled with a file-backed folio for in-place I/O */ newfolio = filemap_alloc_folio(gfp, 0); if (!newfolio) continue; newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; folio = NULL; } spin_lock(&pcl->lockref.lock); if (!pcl->compressed_bvecs[i].page) { pcl->compressed_bvecs[i].page = folio_page(folio ?: newfolio, 0); spin_unlock(&pcl->lockref.lock); continue; } spin_unlock(&pcl->lockref.lock); folio_put(folio ?: newfolio); } /* * Don't perform in-place I/O if all compressed pages are available in * the managed cache, as the pcluster can be moved to the bypass queue. */ if (may_bypass) fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* (erofs_shrinker) disconnect cached encoded data with pclusters */ static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, struct z_erofs_pcluster *pcl) { unsigned int pclusterpages = z_erofs_pclusterpages(pcl); struct folio *folio; int i; DBG_BUGON(pcl->from_meta); /* Each cached folio contains one page unless bs > ps is supported */ for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page) { folio = page_folio(pcl->compressed_bvecs[i].page); /* Avoid reclaiming or migrating this folio */ if (!folio_trylock(folio)) return -EBUSY; if (!erofs_folio_is_managed(sbi, folio)) continue; pcl->compressed_bvecs[i].page = NULL; folio_detach_private(folio); folio_unlock(folio); } } return 0; } static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); struct z_erofs_bvec *bvec = pcl->compressed_bvecs; struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl); bool ret; if (!folio_test_private(folio)) return true; ret = false; spin_lock(&pcl->lockref.lock); if (pcl->lockref.count <= 0) { DBG_BUGON(pcl->from_meta); for (; bvec < end; ++bvec) { if (bvec->page && page_folio(bvec->page) == folio) { bvec->page = NULL; folio_detach_private(folio); ret = true; break; } } } spin_unlock(&pcl->lockref.lock); return ret; } /* * It will be called only on inode eviction. In case that there are still some * decompression requests in progress, wait with rescheduling for a bit here. * An extra lock could be introduced instead but it seems unnecessary. */ static void z_erofs_cache_invalidate_folio(struct folio *folio, size_t offset, size_t length) { const size_t stop = length + offset; /* Check for potential overflow in debug mode */ DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) while (!z_erofs_cache_release_folio(folio, 0)) cond_resched(); } static const struct address_space_operations z_erofs_cache_aops = { .release_folio = z_erofs_cache_release_folio, .invalidate_folio = z_erofs_cache_invalidate_folio, }; int z_erofs_init_super(struct super_block *sb) { struct inode *inode; int err; err = z_erofs_init_pcpu_workers(sb); if (err) return err; inode = new_inode(sb); if (!inode) return -ENOMEM; set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; xa_init(&EROFS_SB(sb)->managed_pslots); return 0; } /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { /* Inplace I/O is limited to one page for uncompressed data */ if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX || fe->icur <= 1) { /* Try to prioritize inplace I/O here */ spin_lock(&pcl->lockref.lock); while (fe->icur > 0) { if (pcl->compressed_bvecs[--fe->icur].page) continue; pcl->compressed_bvecs[fe->icur] = *bvec; spin_unlock(&pcl->lockref.lock); return 0; } spin_unlock(&pcl->lockref.lock); } /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl) { if (lockref_get_not_zero(&pcl->lockref)) return true; spin_lock(&pcl->lockref.lock); if (__lockref_is_dead(&pcl->lockref)) { spin_unlock(&pcl->lockref.lock); return false; } if (!pcl->lockref.count++) atomic_long_dec(&erofs_global_shrink_cnt); spin_unlock(&pcl->lockref.lock); return true; } static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct z_erofs_pcluster *pcl, *pre; unsigned int pageofs_in; int err; pageofs_in = erofs_blkoff(sb, map->m_pa); pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->pclustersize = map->m_plen; pcl->length = 0; pcl->partial = true; pcl->next = fe->head; pcl->pos = map->m_pa; pcl->pageofs_in = pageofs_in; pcl->pageofs_out = map->m_la & ~PAGE_MASK; pcl->from_meta = map->m_flags & EROFS_MAP_META; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. */ mutex_init(&pcl->lock); DBG_BUGON(!mutex_trylock(&pcl->lock)); if (!pcl->from_meta) { while (1) { xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos, NULL, pcl, GFP_KERNEL); if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { xa_unlock(&sbi->managed_pslots); break; } /* try to legitimize the current in-tree one */ xa_unlock(&sbi->managed_pslots); cond_resched(); } if (xa_is_err(pre)) { err = xa_err(pre); goto err_out; } else if (pre) { fe->pcl = pre; err = -EEXIST; goto err_out; } } fe->head = fe->pcl = pcl; return 0; err_out: mutex_unlock(&pcl->lock); z_erofs_free_pcluster(pcl); return err; } static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; struct z_erofs_pcluster *pcl = NULL; int ret; DBG_BUGON(fe->pcl); /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(!fe->head); if (!(map->m_flags & EROFS_MAP_META)) { while (1) { rcu_read_lock(); pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa); if (!pcl || z_erofs_get_pcluster(pcl)) { DBG_BUGON(pcl && map->m_pa != pcl->pos); rcu_read_unlock(); break; } rcu_read_unlock(); } } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; } if (pcl) { fe->pcl = pcl; ret = -EEXIST; } else { ret = z_erofs_register_pcluster(fe); } if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); /* check if this pcluster hasn't been linked into any chain. */ if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) { /* .. so it can be attached to our submission chain */ fe->head = fe->pcl; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; } else { /* otherwise, it belongs to an inflight chain */ fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; } } else if (ret) { return ret; } z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); if (!fe->pcl->from_meta) { /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { void *mptr; mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false); if (IS_ERR(mptr)) { ret = PTR_ERR(mptr); erofs_err(sb, "failed to get inline data %d", ret); return ret; } get_page(map->buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } static void z_erofs_rcu_callback(struct rcu_head *head) { z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu)); } static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, struct z_erofs_pcluster *pcl) { if (pcl->lockref.count) return false; /* * Note that all cached folios should be detached before deleted from * the XArray. Otherwise some folios could be still attached to the * orphan old pcluster when the new one is available in the tree. */ if (erofs_try_to_free_all_cached_folios(sbi, pcl)) return false; /* * It's impossible to fail after the pcluster is freezed, but in order * to avoid some race conditions, add a DBG_BUGON to observe this. */ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl); lockref_mark_dead(&pcl->lockref); return true; } static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, struct z_erofs_pcluster *pcl) { bool free; spin_lock(&pcl->lockref.lock); free = __erofs_try_to_release_pcluster(sbi, pcl); spin_unlock(&pcl->lockref.lock); if (free) { atomic_long_dec(&erofs_global_shrink_cnt); call_rcu(&pcl->rcu, z_erofs_rcu_callback); } return free; } unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr) { struct z_erofs_pcluster *pcl; unsigned long index, freed = 0; xa_lock(&sbi->managed_pslots); xa_for_each(&sbi->managed_pslots, index, pcl) { /* try to shrink each valid pcluster */ if (!erofs_try_to_release_pcluster(sbi, pcl)) continue; xa_unlock(&sbi->managed_pslots); ++freed; if (!--nr) return freed; xa_lock(&sbi->managed_pslots); } xa_unlock(&sbi->managed_pslots); return freed; } static void z_erofs_put_pcluster(struct erofs_sb_info *sbi, struct z_erofs_pcluster *pcl, bool try_free) { bool free = false; if (lockref_put_or_lock(&pcl->lockref)) return; DBG_BUGON(__lockref_is_dead(&pcl->lockref)); if (!--pcl->lockref.count) { if (try_free && xa_trylock(&sbi->managed_pslots)) { free = __erofs_try_to_release_pcluster(sbi, pcl); xa_unlock(&sbi->managed_pslots); } atomic_long_add(!free, &erofs_global_shrink_cnt); } spin_unlock(&pcl->lockref.lock); if (free) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } static void z_erofs_pcluster_end(struct z_erofs_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; /* Drop refcount if it doesn't belong to our processing chain */ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false); fe->pcl = NULL; } static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, unsigned int cur, unsigned int end, erofs_off_t pos) { struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; unsigned int cnt; u8 *src; if (!packed_inode) return -EFSCORRUPTED; buf.mapping = packed_inode->i_mapping; for (; cur < end; cur += cnt, pos += cnt) { cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, pos, true); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } memcpy_to_folio(folio, cur, src, cnt); } erofs_put_metabuf(&buf); return 0; } static int z_erofs_scan_folio(struct z_erofs_frontend *f, struct folio *folio, bool ra) { struct inode *const inode = f->inode; struct erofs_map_blocks *const map = &f->map; const loff_t offset = folio_pos(folio); const unsigned int bs = i_blocksize(inode); unsigned int end = folio_size(folio), split = 0, cur, pgs; bool tight, excl; int err = 0; tight = (bs == PAGE_SIZE); erofs_onlinefolio_init(folio); do { if (offset + end - 1 < map->m_la || offset + end - 1 >= map->m_la + map->m_llen) { z_erofs_pcluster_end(f); map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) break; } cur = offset > map->m_la ? 0 : map->m_la - offset; pgs = round_down(cur, PAGE_SIZE); /* bump split parts first to avoid several separate cases */ ++split; if (!(map->m_flags & EROFS_MAP_MAPPED)) { folio_zero_segment(folio, cur, end); tight = false; } else if (map->m_flags & EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; err = z_erofs_read_fragment(inode->i_sb, folio, cur, cur + min(map->m_llen - fpos, end - cur), EROFS_I(inode)->z_fragmentoff + fpos); if (err) break; tight = false; } else { if (!f->pcl) { err = z_erofs_pcluster_begin(f); if (err) break; f->pcl->besteffort |= !ra; } pgs = round_down(end - 1, PAGE_SIZE); /* * Ensure this partial page belongs to this submit chain * rather than other concurrent submit chains or * noio(bypass) chains since those chains are handled * asynchronously thus it cannot be used for inplace I/O * or bvpage (should be processed in the strict order.) */ tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED); excl = false; if (cur <= pgs) { excl = (split <= 1) || tight; cur = pgs; } err = z_erofs_attach_page(f, &((struct z_erofs_bvec) { .page = folio_page(folio, pgs >> PAGE_SHIFT), .offset = offset + pgs - map->m_la, .end = end - pgs, }), excl); if (err) break; erofs_onlinefolio_split(folio); if (f->pcl->length < offset + end - map->m_la) { f->pcl->length = offset + end - map->m_la; f->pcl->pageofs_out = map->m_la & ~PAGE_MASK; } if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && !(map->m_flags & EROFS_MAP_PARTIAL_REF) && f->pcl->length == map->m_llen) f->pcl->partial = false; } /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; map->m_flags &= ~EROFS_MAP_FULL_MAPPED; if (cur <= pgs) { split = cur < pgs; tight = (bs == PAGE_SIZE); } } while ((end = cur) > 0); erofs_onlinefolio_end(folio, err); return err; } static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && !readahead_pages) return true; if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && (readahead_pages <= sbi->opt.max_sync_decompress_pages)) return true; return false; } static bool z_erofs_page_is_invalidated(struct page *page) { return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page); } struct z_erofs_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; /* pages with the longest decompressed length for deduplication */ struct page **decompressed_pages; /* pages to keep the compressed data */ struct page **compressed_pages; struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, nr_pages; /* indicate if temporary copies should be preserved for later use */ bool keepxcpy; }; struct z_erofs_bvec_item { struct z_erofs_bvec bvec; struct list_head list; }; static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be, struct z_erofs_bvec *bvec) { int poff = bvec->offset + be->pcl->pageofs_out; struct z_erofs_bvec_item *item; struct page **page; if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE || bvec->offset + bvec->end == be->pcl->length)) { DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages); page = be->decompressed_pages + (poff >> PAGE_SHIFT); if (!*page) { *page = bvec->page; return; } } else { be->keepxcpy = true; } /* (cold path) one pcluster is requested multiple times */ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); item->bvec = *bvec; list_add(&item->list, &be->decompressed_secondary_bvecs); } static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err) { unsigned int off0 = be->pcl->pageofs_out; struct list_head *p, *n; list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { struct z_erofs_bvec_item *bvi; unsigned int end, cur; void *dst, *src; bvi = container_of(p, struct z_erofs_bvec_item, list); cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0; end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, bvi->bvec.end); dst = kmap_local_page(bvi->bvec.page); while (cur < end) { unsigned int pgnr, scur, len; pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); scur = bvi->bvec.offset + cur - ((pgnr << PAGE_SHIFT) - off0); len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); if (!be->decompressed_pages[pgnr]) { err = -EFSCORRUPTED; cur += len; continue; } src = kmap_local_page(be->decompressed_pages[pgnr]); memcpy(dst + cur, src + scur, len); kunmap_local(src); cur += len; } kunmap_local(dst); erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); list_del(p); kfree(bvi); } } static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; struct page *old_bvpage; int i; z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); z_erofs_do_decompressed_bvec(be, &bvec); } old_bvpage = z_erofs_bvec_iter_end(&biter); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); } static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped) { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i, err = 0; *overlapped = false; for (i = 0; i < pclusterpages; ++i) { struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; /* compressed data ought to be valid when decompressing */ if (IS_ERR(page) || !page) { bvec->page = NULL; /* clear the failure reason */ err = page ? PTR_ERR(page) : -EIO; continue; } be->compressed_pages[i] = page; if (pcl->from_meta || erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err = -EIO; continue; } DBG_BUGON(z_erofs_page_is_invalidated(page)); if (z_erofs_is_shortlived_page(page)) continue; z_erofs_do_decompressed_bvec(be, bvec); *overlapped = true; } return err; } static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) { struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); const struct z_erofs_decompressor *decomp = z_erofs_decomp[pcl->algorithmformat]; int i, j, jtop, err2; struct page *page; bool overlapped; bool try_free = true; mutex_lock(&pcl->lock); be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; /* allocate (de)compressed page arrays if cannot be kept on stack */ be->decompressed_pages = NULL; be->compressed_pages = NULL; be->onstack_used = 0; if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { be->decompressed_pages = be->onstack_pages; be->onstack_used = be->nr_pages; memset(be->decompressed_pages, 0, sizeof(struct page *) * be->nr_pages); } if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) be->compressed_pages = be->onstack_pages + be->onstack_used; if (!be->decompressed_pages) be->decompressed_pages = kvcalloc(be->nr_pages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); if (!be->compressed_pages) be->compressed_pages = kvcalloc(pclusterpages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); z_erofs_parse_out_bvecs(be); err2 = z_erofs_parse_in_bvecs(be, &overlapped); if (err2) err = err2; if (!err) err = decomp->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .inpages = pclusterpages, .outpages = be->nr_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = be->keepxcpy, .gfp = pcl->besteffort ? GFP_KERNEL : GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); /* must handle all compressed pages before actual file pages */ if (pcl->from_meta) { page = pcl->compressed_bvecs[0].page; WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { /* managed folios are still left in compressed_bvecs[] */ for (i = 0; i < pclusterpages; ++i) { page = be->compressed_pages[i]; if (!page) continue; if (erofs_folio_is_managed(sbi, page_folio(page))) { try_free = false; continue; } (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) kvfree(be->compressed_pages); jtop = 0; z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { erofs_onlinefolio_end(page_folio(page), err); continue; } if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { erofs_pagepool_add(be->pagepool, page); continue; } for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j) ; if (j >= jtop) /* this bounce page is newly detected */ be->decompressed_pages[jtop++] = page; } while (jtop) erofs_pagepool_add(be->pagepool, be->decompressed_pages[--jtop]); if (be->decompressed_pages != be->onstack_pages) kvfree(be->decompressed_pages); pcl->length = 0; pcl->partial = true; pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); if (pcl->from_meta) z_erofs_free_pcluster(pcl); else z_erofs_put_pcluster(sbi, pcl, try_free); return err; } static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { struct z_erofs_backend be = { .sb = io->sb, .pagepool = pagepool, .decompressed_secondary_bvecs = LIST_HEAD_INIT(be.decompressed_secondary_bvecs), .pcl = io->head, }; struct z_erofs_pcluster *next; int err = io->eio ? -EIO : 0; for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) { DBG_BUGON(!be.pcl); next = READ_ONCE(be.pcl->next); err = z_erofs_decompress_pcluster(&be, err) ?: err; } return err; } static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); z_erofs_decompress_queue(bgq, &pagepool); erofs_release_pages(&pagepool); kvfree(bgq); } #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) { z_erofs_decompressqueue_work((struct work_struct *)work); } #endif static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; } if (atomic_add_return(bios, &io->pending_bios)) return; /* Use (kthread_)work and sync decompression for atomic contexts only */ if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD struct kthread_worker *worker; rcu_read_lock(); worker = rcu_dereference( z_erofs_pcpu_workers[raw_smp_processor_id()]); if (!worker) { INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); queue_work(z_erofs_workqueue, &io->u.work); } else { kthread_queue_work(worker, &io->u.kthread_work); } rcu_read_unlock(); #else queue_work(z_erofs_workqueue, &io->u.work); #endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; return; } z_erofs_decompressqueue_work(&io->u.work); } static void z_erofs_fill_bio_vec(struct bio_vec *bvec, struct z_erofs_frontend *f, struct z_erofs_pcluster *pcl, unsigned int nr, struct address_space *mc) { gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; struct z_erofs_bvec zbv; struct address_space *mapping; struct folio *folio; struct page *page; int bs = i_blocksize(f->inode); /* Except for inplace folios, the entire folio can be used for I/Os */ bvec->bv_offset = 0; bvec->bv_len = PAGE_SIZE; repeat: spin_lock(&pcl->lockref.lock); zbv = pcl->compressed_bvecs[nr]; spin_unlock(&pcl->lockref.lock); if (!zbv.page) goto out_allocfolio; bvec->bv_page = zbv.page; DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page)); folio = page_folio(zbv.page); /* For preallocated managed folios, add them to page cache here */ if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) { tocache = true; goto out_tocache; } mapping = READ_ONCE(folio->mapping); /* * File-backed folios for inplace I/Os are all locked steady, * therefore it is impossible for `mapping` to be NULL. */ if (mapping && mapping != mc) { if (zbv.offset < 0) bvec->bv_offset = round_up(-zbv.offset, bs); bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; return; } folio_lock(folio); if (likely(folio->mapping == mc)) { /* * The cached folio is still in managed cache but without * a valid `->private` pcluster hint. Let's reconnect them. */ if (!folio_test_private(folio)) { folio_attach_private(folio, pcl); /* compressed_bvecs[] already takes a ref before */ folio_put(folio); } if (likely(folio->private == pcl)) { /* don't submit cache I/Os again if already uptodate */ if (folio_test_uptodate(folio)) { folio_unlock(folio); bvec->bv_page = NULL; } return; } /* * Already linked with another pcluster, which only appears in * crafted images by fuzzers for now. But handle this anyway. */ tocache = false; /* use temporary short-lived pages */ } else { DBG_BUGON(1); /* referenced managed folios can't be truncated */ tocache = true; } folio_unlock(folio); folio_put(folio); out_allocfolio: page = __erofs_allocpage(&f->pagepool, gfp, true); spin_lock(&pcl->lockref.lock); if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { if (page) erofs_pagepool_add(&f->pagepool, page); spin_unlock(&pcl->lockref.lock); cond_resched(); goto repeat; } pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM); spin_unlock(&pcl->lockref.lock); bvec->bv_page = page; if (!page) return; folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) { /* turn into a temporary shortlived folio (1 ref) */ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; } folio_attach_private(folio, pcl); /* drop a refcount added by allocpage (then 2 refs in total here) */ folio_put(folio); } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; if (fg && !*fg) { q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); if (!q) { *fg = true; goto fg_out; } #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD kthread_init_work(&q->u.kthread_work, z_erofs_decompressqueue_kthread_work); #else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); #endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; q->sync = true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL; return q; } /* define decompression jobqueue types */ enum { JQ_BYPASS, JQ_SUBMIT, NR_JOBQUEUES, }; static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl, struct z_erofs_pcluster *next, struct z_erofs_pcluster **qtail[]) { WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); WRITE_ONCE(*qtail[JQ_SUBMIT], next); WRITE_ONCE(*qtail[JQ_BYPASS], pcl); qtail[JQ_BYPASS] = &pcl->next; } static void z_erofs_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; struct folio_iter fi; bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; DBG_BUGON(folio_test_uptodate(folio)); DBG_BUGON(z_erofs_page_is_invalidated(&folio->page)); if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio)) continue; if (!err) folio_mark_uptodate(folio); folio_unlock(folio); } if (err) q->eio = true; z_erofs_decompress_kickoff(q, -1); if (bio->bi_bdev) bio_put(bio); } static void z_erofs_submit_queue(struct z_erofs_frontend *f, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); struct z_erofs_pcluster **qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; struct z_erofs_pcluster *pcl, *next; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ erofs_off_t last_pa; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; /* No need to read from device for pclusters in the bypass queue. */ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ q[JQ_SUBMIT]->head = next = f->head; do { struct erofs_map_dev mdev; erofs_off_t cur, end; struct bio_vec bvec; unsigned int i = 0; bool bypass = true; pcl = next; next = READ_ONCE(pcl->next); if (pcl->from_meta) { z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { .m_pa = round_down(pcl->pos, sb->s_blocksize), }; (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; end = round_up(cur + pcl->pageofs_in + pcl->pclustersize, sb->s_blocksize); do { bvec.bv_page = NULL; if (bio && (cur != last_pa || bio->bi_bdev != mdev.m_bdev)) { drain_io: if (erofs_is_fileio_mode(EROFS_SB(sb))) erofs_fileio_submit_bio(bio); else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); else submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); memstall = 0; } bio = NULL; } if (!bvec.bv_page) { z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); if (!bvec.bv_page) continue; if (cur + bvec.bv_len > end) bvec.bv_len = end - cur; DBG_BUGON(bvec.bv_len < sb->s_blocksize); } if (unlikely(PageWorkingset(bvec.bv_page)) && !memstall) { psi_memstall_enter(&pflags); memstall = 1; } if (!bio) { if (erofs_is_fileio_mode(EROFS_SB(sb))) bio = erofs_fileio_bio_alloc(&mdev); else if (erofs_is_fscache_mode(sb)) bio = erofs_fscache_bio_alloc(&mdev); else bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_endio; bio->bi_iter.bi_sector = (mdev.m_dif->fsoff + cur) >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) goto drain_io; last_pa = cur + bvec.bv_len; bypass = false; } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else z_erofs_move_to_bypass_queue(pcl, next, qtail); } while (next != Z_EROFS_PCLUSTER_TAIL); if (bio) { if (erofs_is_fileio_mode(EROFS_SB(sb))) erofs_fileio_submit_bio(bio); else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); else submit_bio(bio); } if (memstall) psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); bool force_fg = z_erofs_is_sync_decompress(sbi, rapages); int err; if (f->head == Z_EROFS_PCLUSTER_TAIL) return 0; z_erofs_submit_queue(f, io, &force_fg, !!rapages); /* handle bypass queue (no i/o pclusters) immediately */ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return err; /* wait until all bios are completed */ wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err; } /* * Since partial uptodate is still unimplemented for now, we have to use * approximate readmore strategies as a start. */ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { if (rac) end = headoffset + readahead_length(rac) - 1; else end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; /* expand ra for the trailing edge if readahead */ if (rac) { cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); if (!map->m_llen) return; } cur = map->m_la + map->m_llen - 1; while ((cur >= end) && (cur < i_size_read(inode))) { pgoff_t index = cur >> PAGE_SHIFT; struct folio *folio; folio = erofs_grab_folio_nowait(inode->i_mapping, index); if (!IS_ERR_OR_NULL(folio)) { if (folio_test_uptodate(folio)) folio_unlock(folio); else z_erofs_scan_folio(f, folio, !!rac); folio_put(folio); } if (cur < PAGE_SIZE) break; cur = (index << PAGE_SHIFT) - 1; } } static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio)); int err; trace_erofs_read_folio(folio, false); z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_scan_folio(&f, folio, false); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); /* if some pclusters are ready, need submit them anyway */ err = z_erofs_runqueue(&f, 0) ?: err; if (err && err != -EINTR) erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", err, folio->index, EROFS_I(inode)->nid); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); return err; } static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac)); unsigned int nrpages = readahead_count(rac); struct folio *head = NULL, *folio; int err; trace_erofs_readahead(inode, readahead_index(rac), nrpages, false); z_erofs_pcluster_readmore(&f, rac, true); while ((folio = readahead_folio(rac))) { folio->private = head; head = folio; } /* traverse in reverse order for best metadata I/O performance */ while (head) { folio = head; head = folio_get_private(folio); err = z_erofs_scan_folio(&f, folio, true); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); } z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); (void)z_erofs_runqueue(&f, nrpages); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { .read_folio = z_erofs_read_folio, .readahead = z_erofs_readahead, };
10 1 1 1 7 2 1 1 1 1 2 2 4 2 4 1 3 3 1 5 5 3 2 5 4 4 4 5 4 5 46 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/act_sample.c - Packet sampling tc action * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_sample.h> #include <net/tc_act/tc_sample.h> #include <net/psample.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/if_arp.h> static struct tc_action_ops act_sample_ops; static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) }, [TCA_SAMPLE_RATE] = { .type = NLA_U32 }, [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 }, [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 }, }; static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; u32 psample_group_num, rate, index; struct tcf_chain *goto_ch = NULL; struct tc_sample *parm; struct tcf_sample *s; bool exists = false; int ret, err; if (!nla) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_SAMPLE_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_SAMPLE_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_sample_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } if (!tb[TCA_SAMPLE_RATE] || !tb[TCA_SAMPLE_PSAMPLE_GROUP]) { NL_SET_ERR_MSG(extack, "sample rate and group are required"); err = -EINVAL; goto release_idr; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); if (!rate) { NL_SET_ERR_MSG(extack, "invalid sample rate"); err = -EINVAL; goto put_chain; } psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, psample_group_num); if (!psample_group) { err = -ENOMEM; goto put_chain; } s = to_sample(*a); spin_lock_bh(&s->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); s->rate = rate; s->psample_group_num = psample_group_num; psample_group = rcu_replace_pointer(s->psample_group, psample_group, lockdep_is_held(&s->tcf_lock)); if (tb[TCA_SAMPLE_TRUNC_SIZE]) { s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } spin_unlock_bh(&s->tcf_lock); if (psample_group) psample_group_put(psample_group); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static void tcf_sample_cleanup(struct tc_action *a) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; /* last reference to action, no need to lock */ psample_group = rcu_dereference_protected(s->psample_group, 1); RCU_INIT_POINTER(s->psample_group, NULL); if (psample_group) psample_group_put(psample_group); } static bool tcf_sample_dev_ok_push(struct net_device *dev) { switch (dev->type) { case ARPHRD_TUNNEL: case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: return false; default: return true; } } TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; u8 cookie_data[TC_COOKIE_MAX_SIZE]; struct psample_metadata md = {}; struct tc_cookie *user_cookie; int retval; tcf_lastuse_update(&s->tcf_tm); bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ if (psample_group && (get_random_u32_below(s->rate) == 0)) { if (!skb_at_tc_ingress(skb)) { md.in_ifindex = skb->skb_iif; md.out_ifindex = skb->dev->ifindex; } else { md.in_ifindex = skb->dev->ifindex; } /* on ingress, the mac header gets popped, so push it back */ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_push(skb, skb->mac_len); rcu_read_lock(); user_cookie = rcu_dereference(a->user_cookie); if (user_cookie) { memcpy(cookie_data, user_cookie->data, user_cookie->len); md.user_cookie = cookie_data; md.user_cookie_len = user_cookie->len; } rcu_read_unlock(); md.trunc_size = s->truncate ? s->trunc_size : skb->len; psample_sample_packet(psample_group, skb, s->rate, &md); if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_pull(skb, skb->mac_len); } return retval; } static void tcf_sample_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_sample *s = to_sample(a); struct tcf_t *tm = &s->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_sample *s = to_sample(a); struct tc_sample opt = { .index = s->tcf_index, .refcnt = refcount_read(&s->tcf_refcnt) - ref, .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&s->tcf_lock); opt.action = s->tcf_action; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &s->tcf_tm); if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD)) goto nla_put_failure; if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate)) goto nla_put_failure; if (s->truncate) if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size)) goto nla_put_failure; if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; spin_unlock_bh(&s->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&s->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_psample_group_put(void *priv) { struct psample_group *group = priv; psample_group_put(group); } static struct psample_group * tcf_sample_get_group(const struct tc_action *a, tc_action_priv_destructor *destructor) { struct tcf_sample *s = to_sample(a); struct psample_group *group; group = rcu_dereference_protected(s->psample_group, lockdep_is_held(&s->tcf_lock)); if (group) { psample_group_take(group); *destructor = tcf_psample_group_put; } return group; } static void tcf_offload_sample_get_group(struct flow_action_entry *entry, const struct tc_action *act) { entry->sample.psample_group = act->ops->get_psample_group(act, &entry->destructor); entry->destructor_priv = entry->sample.psample_group; } static int tcf_sample_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_SAMPLE; entry->sample.trunc_size = tcf_sample_trunc_size(act); entry->sample.truncate = tcf_sample_truncate(act); entry->sample.rate = tcf_sample_rate(act); tcf_offload_sample_get_group(entry, act); *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_SAMPLE; } return 0; } static struct tc_action_ops act_sample_ops = { .kind = "sample", .id = TCA_ID_SAMPLE, .owner = THIS_MODULE, .act = tcf_sample_act, .stats_update = tcf_sample_stats_update, .dump = tcf_sample_dump, .init = tcf_sample_init, .cleanup = tcf_sample_cleanup, .get_psample_group = tcf_sample_get_group, .offload_act_setup = tcf_sample_offload_act_setup, .size = sizeof(struct tcf_sample), }; MODULE_ALIAS_NET_ACT("sample"); static __net_init int sample_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id); return tc_action_net_init(net, tn, &act_sample_ops); } static void __net_exit sample_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_sample_ops.net_id); } static struct pernet_operations sample_net_ops = { .init = sample_init_net, .exit_batch = sample_exit_net, .id = &act_sample_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init sample_init_module(void) { return tcf_register_action(&act_sample_ops, &sample_net_ops); } static void __exit sample_cleanup_module(void) { tcf_unregister_action(&act_sample_ops, &sample_net_ops); } module_init(sample_init_module); module_exit(sample_cleanup_module); MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>"); MODULE_DESCRIPTION("Packet sampling action"); MODULE_LICENSE("GPL v2");
137 207 100 122 167 133 2 86 7 79 68 7 61 17 61 102 65 1 63 65 66 67 9 8 4 3 2 3 2 72 7 79 79 15 66 13 1 14 15 151 150 19 38 6 82 2 7 3 85 44 39 84 5 15 1 64 141 4 121 100 98 100 1 12 40 26 66 2 19 97 96 47 1 46 46 4 7 4 3 3 3 6 5 1 5 181 180 2 181 178 174 3 11 2 2 158 161 3 4 1 3 1 1 1 1 6 5 1 1 4 36 12 35 16 5 3 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Implementation of the ICMP protocol layer. * * Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Some of the function names and the icmp unreach table for this * module were derived from [icmp.c 1.0.11 06/02/93] by * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. * Other than that this module is a complete rewrite. * * Fixes: * Clemens Fruhwirth : introduce global icmp rate limiting * with icmp type masking ability instead * of broken per type icmp timeouts. * Mike Shaver : RFC1122 checks. * Alan Cox : Multicast ping reply as self. * Alan Cox : Fix atomicity lockup in ip_build_xmit * call. * Alan Cox : Added 216,128 byte paths to the MTU * code. * Martin Mares : RFC1812 checks. * Martin Mares : Can be configured to follow redirects * if acting as a router _without_ a * routing protocol (RFC 1812). * Martin Mares : Echo requests may be configured to * be ignored (RFC 1812). * Martin Mares : Limitation of ICMP error message * transmit rate (RFC 1812). * Martin Mares : TOS and Precedence set correctly * (RFC 1812). * Martin Mares : Now copying as much data from the * original packet as we can without * exceeding 576 bytes (RFC 1812). * Willy Konynenberg : Transparent proxying support. * Keith Owens : RFC1191 correction for 4.2BSD based * path MTU bug. * Thomas Quinot : ICMP Dest Unreach codes up to 15 are * valid (RFC 1812). * Andi Kleen : Check all packet lengths properly * and moved all kfree_skb() up to * icmp_rcv. * Andi Kleen : Move the rate limit bookkeeping * into the dest entry and use a token * bucket filter (thanks to ANK). Make * the rates sysctl configurable. * Yu Tianli : Fixed two ugly bugs in icmp_send * - IP option length was accounted wrongly * - ICMP header length was not accounted * at all. * Tristan Greaves : Added sysctl option to ignore bogus * broadcast responses from broken routers. * * To Fix: * * - Should use skb_pull() instead of all the manual checking. * This would also greatly simply some upper layer error handlers. --AK */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> #include <linux/slab.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> #include <net/protocol.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/udp.h> #include <net/raw.h> #include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/uaccess.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/inet_common.h> #include <net/ip_fib.h> #include <net/l3mdev.h> #include <net/addrconf.h> #include <net/inet_dscp.h> #define CREATE_TRACE_POINTS #include <trace/events/icmp.h> /* * Build xmit assembly blocks */ struct icmp_bxm { struct sk_buff *skb; int offset; int data_len; struct { struct icmphdr icmph; __be32 times[3]; } data; int head_len; struct ip_options_data replyopts; }; /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ const struct icmp_err icmp_err_convert[] = { { .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ .fatal = 0, }, { .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */ .fatal = 0, }, { .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */, .fatal = 1, }, { .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */ .fatal = 1, }, { .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */ .fatal = 0, }, { .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */ .fatal = 0, }, { .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */ .fatal = 1, }, { .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */ .fatal = 1, }, { .errno = ENONET, /* ICMP_HOST_ISOLATED */ .fatal = 1, }, { .errno = ENETUNREACH, /* ICMP_NET_ANO */ .fatal = 1, }, { .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */ .fatal = 1, }, { .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */ .fatal = 0, }, { .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */ .fatal = 0, }, { .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */ .fatal = 1, }, { .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */ .fatal = 1, }, { .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */ .fatal = 1, }, }; EXPORT_SYMBOL(icmp_err_convert); /* * ICMP control array. This specifies what to do with each ICMP. */ struct icmp_control { enum skb_drop_reason (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk); /* Called with BH disabled */ static inline struct sock *icmp_xmit_lock(struct net *net) { struct sock *sk; sk = this_cpu_read(ipv4_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ return NULL; } sock_net_set(sk, net); return sk; } static inline void icmp_xmit_unlock(struct sock *sk) { sock_net_set(sk, &init_net); spin_unlock(&sk->sk_lock.slock); } /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * @net: network namespace * * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. * Works in tandem with icmp_global_consume(). */ bool icmp_global_allow(struct net *net) { u32 delta, now, oldstamp; int incr, new, old; /* Note: many cpus could find this condition true. * Then later icmp_global_consume() could consume more credits, * this is an acceptable race. */ if (atomic_read(&net->ipv4.icmp_global_credit) > 0) return true; now = jiffies; oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp); delta = min_t(u32, now - oldstamp, HZ); if (delta < HZ / 50) return false; incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ; if (!incr) return false; if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) { old = atomic_read(&net->ipv4.icmp_global_credit); do { new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst)); } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new)); } return true; } EXPORT_SYMBOL(icmp_global_allow); void icmp_global_consume(struct net *net) { int credits = get_random_u32_below(3); /* Note: this might make icmp_global.credit negative. */ if (credits) atomic_sub(credits, &net->ipv4.icmp_global_credit); } EXPORT_SYMBOL(icmp_global_consume); static bool icmpv4_mask_allow(struct net *net, int type, int code) { if (type > NR_ICMP_TYPES) return true; /* Don't limit PMTU discovery. */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) return true; /* Limit if icmp type is enabled in ratemask. */ if (!((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask))) return true; return false; } static bool icmpv4_global_allow(struct net *net, int type, int code, bool *apply_ratelimit) { if (icmpv4_mask_allow(net, type, code)) return true; if (icmp_global_allow(net)) { *apply_ratelimit = true; return true; } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } /* * Send an ICMP frame. */ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct flowi4 *fl4, int type, int code, bool apply_ratelimit) { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; if (!apply_ratelimit) return true; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, l3mdev_master_ifindex_rcu(dst->dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); rcu_read_unlock(); out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else icmp_global_consume(net); return rc; } /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ void icmp_out_count(struct net *net, unsigned char type) { ICMPMSGOUT_INC_STATS(net, type); ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); } /* * Checksum each fragment, and on the first include the headers and final * checksum. */ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct icmp_bxm *icmp_param = from; __wsum csum; csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); if (icmp_pointers[icmp_param->data.icmph.type].error) nf_ct_attach(skb, icmp_param->skb); return 0; } static void icmp_push_reply(struct sock *sk, struct icmp_bxm *icmp_param, struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rt) { struct sk_buff *skb; if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) { __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS); ip_flush_pending_frames(sk); } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); __wsum csum; struct sk_buff *skb1; csum = csum_partial_copy_nocheck((void *)&icmp_param->data, (char *)icmph, icmp_param->head_len); skb_queue_walk(&sk->sk_write_queue, skb1) { csum = csum_add(csum, skb1->csum); } icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(sk, fl4); } } /* * Driving logic for building and sending ICMP messages. */ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net *net = dev_net_rcu(rt->dst.dev); bool apply_ratelimit = false; struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* is global icmp_msgs_per_sec exhausted ? */ if (!icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; icmp_param->data.icmph.checksum = 0; ipcm_init(&ipc); ipc.tos = ip_hdr(skb)->tos; ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; if (ipc.opt->opt.srr) daddr = icmp_param->replyopts.opt.opt.faddr; } memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); } /* * The device used for looking up which routing table to use for sending an ICMP * error is preferably the source whenever it is set, which should ensure the * icmp error can be sent to the source host, else lookup using the routing * table of the destination device, else use the main routing table (index 0). */ static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) { struct net_device *route_lookup_dev = NULL; if (skb->dev) route_lookup_dev = skb->dev; else if (skb_dst(skb)) route_lookup_dev = skb_dst(skb)->dev; return route_lookup_dev; } static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, const struct iphdr *iph, __be32 saddr, dscp_t dscp, u32 mark, int type, int code, struct icmp_bxm *param) { struct net_device *route_lookup_dev; struct dst_entry *dst, *dst2; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = (param->replyopts.opt.opt.srr ? param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; route_lookup_dev = icmp_get_route_lookup_dev(skb_in); fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; /* No need to clone since we're just using its address. */ rt2 = rt; dst = xfrm_lookup(net, &rt->dst, flowi4_to_flowi(fl4), NULL, 0); rt = dst_rtable(dst); if (!IS_ERR(dst)) { if (rt != rt2) return rt; if (inet_addr_type_dev_table(net, route_lookup_dev, fl4->daddr) == RTN_LOCAL) return rt; } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; } else { return rt; } err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) err = PTR_ERR(rt2); } else { struct flowi4 fl4_2 = {}; unsigned long orefdst; fl4_2.daddr = fl4_dec.saddr; rt2 = ip_route_output_key(net, &fl4_2); if (IS_ERR(rt2)) { err = PTR_ERR(rt2); goto relookup_failed; } /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); skb_in->_skb_refdst = orefdst; /* restore old refdst */ } if (err) goto relookup_failed; dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL, XFRM_LOOKUP_ICMP); rt2 = dst_rtable(dst2); if (!IS_ERR(dst2)) { dst_release(&rt->dst); memcpy(fl4, &fl4_dec, sizeof(*fl4)); rt = rt2; } else if (PTR_ERR(dst2) == -EPERM) { if (rt) dst_release(&rt->dst); return rt2; } else { err = PTR_ERR(dst2); goto relookup_failed; } return rt; relookup_failed: if (rt) return rt; return ERR_PTR(err); } /* * Send an ICMP message in response to a situation * * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header. * MAY send more (we do). * MUST NOT change this header information. * MUST NOT reply to a multicast/broadcast IP address. * MUST NOT reply to a multicast/broadcast MAC address. * MUST reply to only the first fragment. */ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, const struct ip_options *opt) { struct iphdr *iph; int room; struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); bool apply_ratelimit = false; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; u8 tos; u32 mark; struct net *net; struct sock *sk; if (!rt) return; rcu_read_lock(); if (rt->dst.dev) net = dev_net_rcu(rt->dst.dev); else if (skb_in->dev) net = dev_net_rcu(skb_in->dev); else goto out; /* * Find the original header. It is expected to be valid, of course. * Check this, icmp_send is called from the most obscure devices * sometimes. */ iph = ip_hdr(skb_in); if ((u8 *)iph < skb_in->head || (skb_network_header(skb_in) + sizeof(*iph)) > skb_tail_pointer(skb_in)) goto out; /* * No replies to physical multicast/broadcast */ if (skb_in->pkt_type != PACKET_HOST) goto out; /* * Now check at the protocol level */ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto out; /* * Only reply to fragment 0. We byte re-order the constant * mask for efficiency. */ if (iph->frag_off & htons(IP_OFFSET)) goto out; /* * If we send an ICMP error to an ICMP error a mess would result.. */ if (icmp_pointers[type].error) { /* * We are an error, check if we are replying to an * ICMP error */ if (iph->protocol == IPPROTO_ICMP) { u8 _inner_type, *itp; itp = skb_header_pointer(skb_in, skb_network_header(skb_in) + (iph->ihl << 2) + offsetof(struct icmphdr, type) - skb_in->data, sizeof(_inner_type), &_inner_type); if (!itp) goto out; /* * Assume any unknown ICMP type is an error. This * isn't specified by the RFC, but think about it.. */ if (*itp > NR_ICMP_TYPES || icmp_pointers[*itp].error) goto out; } } /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless * incoming dev is loopback. If outgoing dev change to not be * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) */ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && !icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; /* * Construct source address and options. */ saddr = iph->daddr; if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; rcu_read_lock(); if (rt_is_input_route(rt) && READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)) dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, iph->saddr, RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); } tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) goto out_unlock; /* * Prepare data for ICMP header. */ icmp_param.data.icmph.type = type; icmp_param.data.icmph.code = code; icmp_param.data.icmph.un.gateway = info; icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); ipcm_init(&ipc); ipc.tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, inet_dsfield_to_dscp(tos), mark, type, code, &icmp_param); if (IS_ERR(rt)) goto out_unlock; /* peer icmp_ratelimit */ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ room = dst_mtu(&rt->dst); if (room > 576) room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); /* Guard against tiny mtu. We need to include at least one * IP network header for this message to make any sense. */ if (room <= (int)sizeof(struct iphdr)) goto ende; icmp_param.data_len = skb_in->len - icmp_param.offset; if (icmp_param.data_len > room) icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); /* if we don't have a source address at this point, fall back to the * dummy address instead of sending out a packet with a source address * of 0.0.0.0 */ if (!fl4.saddr) fl4.saddr = htonl(INADDR_DUMMY); trace_icmp_send(skb_in, type, code); icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); out: rcu_read_unlock(); } EXPORT_SYMBOL(__icmp_send); #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; struct nf_conn *ct; __be32 orig_ip; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { __icmp_send(skb_in, type, code, info, &opts); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct iphdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct iphdr)))) goto out; orig_ip = ip_hdr(skb_in)->saddr; ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; __icmp_send(skb_in, type, code, info, &opts); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmp_ndo_send); #endif static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const struct net_protocol *ipprot; int protocol = iph->protocol; /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return; } raw_icmp_error(skb, protocol, info); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); } static bool icmp_tag_validation(int proto) { bool ok; rcu_read_lock(); ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; rcu_read_unlock(); return ok; } /* * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; const struct iphdr *iph; struct icmphdr *icmph; struct net *net; u32 info = 0; net = dev_net_rcu(skb_dst(skb)->dev); /* * Incomplete header ? * Only checks for the IP header, there should be an * additional check for longer headers in upper levels. */ if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out_err; icmph = icmp_hdr(skb); iph = (const struct iphdr *)skb->data; if (iph->ihl < 5) { /* Mangled header, drop. */ reason = SKB_DROP_REASON_IP_INHDR; goto out_err; } switch (icmph->type) { case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: case ICMP_PROT_UNREACH: case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: /* for documentation of the ip_no_pmtu_disc * values please see * Documentation/networking/ip-sysctl.rst */ switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) { default: net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", &iph->daddr); break; case 2: goto out; case 3: if (!icmp_tag_validation(iph->protocol)) goto out; fallthrough; case 0: info = ntohs(icmph->un.frag.mtu); } break; case ICMP_SR_FAILED: net_dbg_ratelimited("%pI4: Source Route Failed\n", &iph->daddr); break; default: break; } if (icmph->code > NR_ICMP_UNREACH) goto out; break; case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; break; case ICMP_TIME_EXCEEDED: __ICMP_