| 111 111 111 112 111 113 112 8 109 5 38 38 72 72 110 111 3 109 5 109 3 113 112 5 109 109 101 80 38 38 71 72 81 17 74 81 2 26 39 39 69 69 69 69 3 3 3 3 19 23 56 20 18 18 106 41 6 27 8 2 207 13 13 35 203 203 143 139 131 4 130 131 32 42 30 12 13 105 140 140 145 145 146 22 21 146 146 133 130 7 29 123 36 4 96 125 3 102 27 125 3 16 99 130 130 79 46 129 125 126 5 85 9 3 70 4 8 49 29 77 2 1 3 3 20 58 15 83 83 76 58 27 2 15 15 15 4 1 15 8 8 8 7 1 5 7 7 5 11 11 11 2 9 3 1 7 9 3 4 9 8 8 4 4 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004, OGAWA Hirofumi */ #include <linux/blkdev.h> #include <linux/sched/signal.h> #include <linux/backing-dev-defs.h> #include "fat.h" struct fatent_operations { void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); void (*ent_set_ptr)(struct fat_entry *, int); int (*ent_bread)(struct super_block *, struct fat_entry *, int, sector_t); int (*ent_get)(struct fat_entry *); void (*ent_put)(struct fat_entry *, int); int (*ent_next)(struct fat_entry *); }; static DEFINE_SPINLOCK(fat12_entry_lock); static void fat12_ent_blocknr(struct super_block *sb, int entry, int *offset, sector_t *blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = entry + (entry >> 1); WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } static void fat_ent_blocknr(struct super_block *sb, int entry, int *offset, sector_t *blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = (entry << sbi->fatent_shift); WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset) { struct buffer_head **bhs = fatent->bhs; if (fatent->nr_bhs == 1) { WARN_ON(offset >= (bhs[0]->b_size - 1)); fatent->u.ent12_p[0] = bhs[0]->b_data + offset; fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1); } else { WARN_ON(offset != (bhs[0]->b_size - 1)); fatent->u.ent12_p[0] = bhs[0]->b_data + offset; fatent->u.ent12_p[1] = bhs[1]->b_data; } } static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset) { WARN_ON(offset & (2 - 1)); fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset); } static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset) { WARN_ON(offset & (4 - 1)); fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset); } static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { struct buffer_head **bhs = fatent->bhs; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; bhs[0] = sb_bread(sb, blocknr); if (!bhs[0]) goto err; if ((offset + 1) < sb->s_blocksize) fatent->nr_bhs = 1; else { /* This entry is block boundary, it needs the next block */ blocknr++; bhs[1] = sb_bread(sb, blocknr); if (!bhs[1]) goto err_brelse; fatent->nr_bhs = 2; } fat12_ent_set_ptr(fatent, offset); return 0; err_brelse: brelse(bhs[0]); err: fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } fatent->nr_bhs = 1; ops->ent_set_ptr(fatent, offset); return 0; } static int fat12_ent_get(struct fat_entry *fatent) { u8 **ent12_p = fatent->u.ent12_p; int next; spin_lock(&fat12_entry_lock); if (fatent->entry & 1) next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); else next = (*ent12_p[1] << 8) | *ent12_p[0]; spin_unlock(&fat12_entry_lock); next &= 0x0fff; if (next >= BAD_FAT12) next = FAT_ENT_EOF; return next; } static int fat16_ent_get(struct fat_entry *fatent) { int next = le16_to_cpu(*fatent->u.ent16_p); WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1)); if (next >= BAD_FAT16) next = FAT_ENT_EOF; return next; } static int fat32_ent_get(struct fat_entry *fatent) { int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff; WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1)); if (next >= BAD_FAT32) next = FAT_ENT_EOF; return next; } static void fat12_ent_put(struct fat_entry *fatent, int new) { u8 **ent12_p = fatent->u.ent12_p; if (new == FAT_ENT_EOF) new = EOF_FAT12; spin_lock(&fat12_entry_lock); if (fatent->entry & 1) { *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); *ent12_p[1] = new >> 4; } else { *ent12_p[0] = new & 0xff; *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); } spin_unlock(&fat12_entry_lock); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); if (fatent->nr_bhs == 2) mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); } static void fat16_ent_put(struct fat_entry *fatent, int new) { if (new == FAT_ENT_EOF) new = EOF_FAT16; *fatent->u.ent16_p = cpu_to_le16(new); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static void fat32_ent_put(struct fat_entry *fatent, int new) { WARN_ON(new & 0xf0000000); new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; *fatent->u.ent32_p = cpu_to_le32(new); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static int fat12_ent_next(struct fat_entry *fatent) { u8 **ent12_p = fatent->u.ent12_p; struct buffer_head **bhs = fatent->bhs; u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1); fatent->entry++; if (fatent->nr_bhs == 1) { WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2))); WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { ent12_p[0] = nextp - 1; ent12_p[1] = nextp; return 1; } } else { WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); ent12_p[0] = nextp - 1; ent12_p[1] = nextp; brelse(bhs[0]); bhs[0] = bhs[1]; fatent->nr_bhs = 1; return 1; } ent12_p[0] = NULL; ent12_p[1] = NULL; return 0; } static int fat16_ent_next(struct fat_entry *fatent) { const struct buffer_head *bh = fatent->bhs[0]; fatent->entry++; if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) { fatent->u.ent16_p++; return 1; } fatent->u.ent16_p = NULL; return 0; } static int fat32_ent_next(struct fat_entry *fatent) { const struct buffer_head *bh = fatent->bhs[0]; fatent->entry++; if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) { fatent->u.ent32_p++; return 1; } fatent->u.ent32_p = NULL; return 0; } static const struct fatent_operations fat12_ops = { .ent_blocknr = fat12_ent_blocknr, .ent_set_ptr = fat12_ent_set_ptr, .ent_bread = fat12_ent_bread, .ent_get = fat12_ent_get, .ent_put = fat12_ent_put, .ent_next = fat12_ent_next, }; static const struct fatent_operations fat16_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat16_ent_set_ptr, .ent_bread = fat_ent_bread, .ent_get = fat16_ent_get, .ent_put = fat16_ent_put, .ent_next = fat16_ent_next, }; static const struct fatent_operations fat32_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat32_ent_set_ptr, .ent_bread = fat_ent_bread, .ent_get = fat32_ent_get, .ent_put = fat32_ent_put, .ent_next = fat32_ent_next, }; static inline void lock_fat(struct msdos_sb_info *sbi) { mutex_lock(&sbi->fat_lock); } static inline void unlock_fat(struct msdos_sb_info *sbi) { mutex_unlock(&sbi->fat_lock); } void fat_ent_access_init(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); mutex_init(&sbi->fat_lock); if (is_fat32(sbi)) { sbi->fatent_shift = 2; sbi->fatent_ops = &fat32_ops; } else if (is_fat16(sbi)) { sbi->fatent_shift = 1; sbi->fatent_ops = &fat16_ops; } else if (is_fat12(sbi)) { sbi->fatent_shift = -1; sbi->fatent_ops = &fat12_ops; } else { fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits); } } static void mark_fsinfo_dirty(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sb_rdonly(sb) || !is_fat32(sbi)) return; __mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC); } static inline int fat_ent_update_ptr(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct buffer_head **bhs = fatent->bhs; /* Is this fatent's blocks including this entry? */ if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr) return 0; if (is_fat12(sbi)) { if ((offset + 1) < sb->s_blocksize) { /* This entry is on bhs[0]. */ if (fatent->nr_bhs == 2) { brelse(bhs[1]); fatent->nr_bhs = 1; } } else { /* This entry needs the next block. */ if (fatent->nr_bhs != 2) return 0; if (bhs[1]->b_blocknr != (blocknr + 1)) return 0; } } ops->ent_set_ptr(fatent, offset); return 1; } int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); const struct fatent_operations *ops = sbi->fatent_ops; int err, offset; sector_t blocknr; if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; } fatent_set_entry(fatent, entry); ops->ent_blocknr(sb, entry, &offset, &blocknr); if (!fat_ent_update_ptr(sb, fatent, offset, blocknr)) { fatent_brelse(fatent); err = ops->ent_bread(sb, fatent, offset, blocknr); if (err) return err; } return ops->ent_get(fatent); } /* FIXME: We can write the blocks as more big chunk. */ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, int nr_bhs) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *c_bh; int err, n, copy; err = 0; for (copy = 1; copy < sbi->fats; copy++) { sector_t backup_fat = sbi->fat_length * copy; for (n = 0; n < nr_bhs; n++) { c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr); if (!c_bh) { err = -ENOMEM; goto error; } /* Avoid race with userspace read via bdev */ lock_buffer(c_bh); memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); unlock_buffer(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); brelse(c_bh); if (err) goto error; } } error: return err; } int fat_ent_write(struct inode *inode, struct fat_entry *fatent, int new, int wait) { struct super_block *sb = inode->i_sb; const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; int err; ops->ent_put(fatent, new); if (wait) { err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs); if (err) return err; } return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs); } static inline int fat_ent_next(struct msdos_sb_info *sbi, struct fat_entry *fatent) { if (sbi->fatent_ops->ent_next(fatent)) { if (fatent->entry < sbi->max_cluster) return 1; } return 0; } static inline int fat_ent_read_block(struct super_block *sb, struct fat_entry *fatent) { const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int offset; fatent_brelse(fatent); ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); return ops->ent_bread(sb, fatent, offset, blocknr); } static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs, struct fat_entry *fatent) { int n, i; for (n = 0; n < fatent->nr_bhs; n++) { for (i = 0; i < *nr_bhs; i++) { if (fatent->bhs[n] == bhs[i]) break; } if (i == *nr_bhs) { get_bh(fatent->bhs[n]); bhs[i] = fatent->bhs[n]; (*nr_bhs)++; } } } int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent, prev_ent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, count, err, nr_bhs, idx_clus; BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2)); /* fixed limit */ lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid && sbi->free_clusters < nr_cluster) { unlock_fat(sbi); return -ENOSPC; } err = nr_bhs = idx_clus = 0; count = FAT_START_ENT; fatent_init(&prev_ent); fatent_init(&fatent); fatent_set_entry(&fatent, sbi->prev_free + 1); while (count < sbi->max_cluster) { if (fatent.entry >= sbi->max_cluster) fatent.entry = FAT_START_ENT; fatent_set_entry(&fatent, fatent.entry); err = fat_ent_read_block(sb, &fatent); if (err) goto out; /* Find the free entries in a block */ do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) { int entry = fatent.entry; /* make the cluster chain */ ops->ent_put(&fatent, FAT_ENT_EOF); if (prev_ent.nr_bhs) ops->ent_put(&prev_ent, entry); fat_collect_bhs(bhs, &nr_bhs, &fatent); sbi->prev_free = entry; if (sbi->free_clusters != -1) sbi->free_clusters--; cluster[idx_clus] = entry; idx_clus++; if (idx_clus == nr_cluster) goto out; /* * fat_collect_bhs() gets ref-count of bhs, * so we can still use the prev_ent. */ prev_ent = fatent; } count++; if (count == sbi->max_cluster) break; } while (fat_ent_next(sbi, &fatent)); } /* Couldn't allocate the free entries */ sbi->free_clusters = 0; sbi->free_clus_valid = 1; err = -ENOSPC; out: unlock_fat(sbi); mark_fsinfo_dirty(sb); fatent_brelse(&fatent); if (!err) { if (inode_needs_sync(inode)) err = fat_sync_bhs(bhs, nr_bhs); if (!err) err = fat_mirror_bhs(sb, bhs, nr_bhs); } for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); if (err && idx_clus) fat_free_clusters(inode, cluster[0]); return err; } int fat_free_clusters(struct inode *inode, int cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; int first_cl = cluster, dirty_fsinfo = 0; nr_bhs = 0; fatent_init(&fatent); lock_fat(sbi); do { cluster = fat_ent_read(inode, &fatent, cluster); if (cluster < 0) { err = cluster; goto error; } else if (cluster == FAT_ENT_FREE) { fat_fs_error(sb, "%s: deleting FAT entry beyond EOF", __func__); err = -EIO; goto error; } if (sbi->options.discard) { /* * Issue discard for the sectors we no longer * care about, batching contiguous clusters * into one request */ if (cluster != fatent.entry + 1) { int nr_clus = fatent.entry - first_cl + 1; sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), nr_clus * sbi->sec_per_clus, GFP_NOFS, 0); first_cl = cluster; } } ops->ent_put(&fatent, FAT_ENT_FREE); if (sbi->free_clusters != -1) { sbi->free_clusters++; dirty_fsinfo = 1; } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; } err = fat_mirror_bhs(sb, bhs, nr_bhs); if (err) goto error; for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); nr_bhs = 0; } fat_collect_bhs(bhs, &nr_bhs, &fatent); } while (cluster != FAT_ENT_EOF); if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; } err = fat_mirror_bhs(sb, bhs, nr_bhs); error: fatent_brelse(&fatent); for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); unlock_fat(sbi); if (dirty_fsinfo) mark_fsinfo_dirty(sb); return err; } EXPORT_SYMBOL_GPL(fat_free_clusters); struct fatent_ra { sector_t cur; sector_t limit; unsigned int ra_blocks; sector_t ra_advance; sector_t ra_next; sector_t ra_limit; }; static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra, struct fat_entry *fatent, int ent_limit) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; sector_t blocknr, block_end; int offset; /* * This is the sequential read, so ra_pages * 2 (but try to * align the optimal hardware IO size). * [BTW, 128kb covers the whole sectors for FAT12 and FAT16] */ unsigned long ra_pages = sb->s_bdi->ra_pages; unsigned int reada_blocks; if (fatent->entry >= ent_limit) return; if (ra_pages > sb->s_bdi->io_pages) ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages); reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1); /* Initialize the range for sequential read */ ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); ops->ent_blocknr(sb, ent_limit - 1, &offset, &block_end); ra->cur = 0; ra->limit = (block_end + 1) - blocknr; /* Advancing the window at half size */ ra->ra_blocks = reada_blocks >> 1; ra->ra_advance = ra->cur; ra->ra_next = ra->cur; ra->ra_limit = ra->cur + min_t(sector_t, reada_blocks, ra->limit); } /* Assuming to be called before reading a new block (increments ->cur). */ static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra, struct fat_entry *fatent) { if (ra->ra_next >= ra->ra_limit) return; if (ra->cur >= ra->ra_advance) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct blk_plug plug; sector_t blocknr, diff; int offset; ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); diff = blocknr - ra->cur; blk_start_plug(&plug); /* * FIXME: we would want to directly use the bio with * pages to reduce the number of segments. */ for (; ra->ra_next < ra->ra_limit; ra->ra_next++) sb_breadahead(sb, ra->ra_next + diff); blk_finish_plug(&plug); /* Advance the readahead window */ ra->ra_advance += ra->ra_blocks; ra->ra_limit += min_t(sector_t, ra->ra_blocks, ra->limit - ra->ra_limit); } ra->cur++; } int fat_count_free_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct fatent_ra fatent_ra; int err = 0, free; lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid) goto out; free = 0; fatent_init(&fatent); fatent_set_entry(&fatent, FAT_START_ENT); fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster); while (fatent.entry < sbi->max_cluster) { /* readahead of fat blocks */ fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) goto out; do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, &fatent)); cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; mark_fsinfo_dirty(sb); fatent_brelse(&fatent); out: unlock_fat(sbi); return err; } static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus) { struct msdos_sb_info *sbi = MSDOS_SB(sb); return sb_issue_discard(sb, fat_clus_to_blknr(sbi, clus), nr_clus * sbi->sec_per_clus, GFP_NOFS, 0); } int fat_trim_fs(struct inode *inode, struct fstrim_range *range) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct fatent_ra fatent_ra; u64 ent_start, ent_end, minlen, trimmed = 0; u32 free = 0; int err = 0; /* * FAT data is organized as clusters, trim at the granulary of cluster. * * fstrim_range is in byte, convert values to cluster index. * Treat sectors before data region as all used, not to trim them. */ ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT); ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1; minlen = range->minlen >> sbi->cluster_bits; if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size) return -EINVAL; if (ent_end >= sbi->max_cluster) ent_end = sbi->max_cluster - 1; fatent_init(&fatent); lock_fat(sbi); fatent_set_entry(&fatent, ent_start); fat_ra_init(sb, &fatent_ra, &fatent, ent_end + 1); while (fatent.entry <= ent_end) { /* readahead of fat blocks */ fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) goto error; do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) { free++; } else if (free) { if (free >= minlen) { u32 clus = fatent.entry - free; err = fat_trim_clusters(sb, clus, free); if (err && err != -EOPNOTSUPP) goto error; if (!err) trimmed += free; err = 0; } free = 0; } } while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end); if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto error; } if (need_resched()) { fatent_brelse(&fatent); unlock_fat(sbi); cond_resched(); lock_fat(sbi); } } /* handle scenario when tail entries are all free */ if (free && free >= minlen) { u32 clus = fatent.entry - free; err = fat_trim_clusters(sb, clus, free); if (err && err != -EOPNOTSUPP) goto error; if (!err) trimmed += free; err = 0; } error: fatent_brelse(&fatent); unlock_fat(sbi); range->len = trimmed << sbi->cluster_bits; return err; } |
| 220 174 174 66 66 66 216 217 217 1 1 1 1 1 1 1 1 1 1 1 217 214 215 1 1 1 1 1 1 1 1 1 1 216 216 195 7 215 215 1 216 232 50 216 216 215 114 21 201 215 66 113 215 210 212 215 215 214 215 231 50 215 61 61 4 61 61 61 57 61 61 61 62 62 62 36 35 35 61 61 5 6 1 5 5 7 7 3 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015 Facebook. All rights reserved. */ #include <linux/kernel.h> #include <linux/sched/mm.h> #include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" #include "free-space-tree.h" #include "transaction.h" #include "block-group.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); static struct btrfs_root *btrfs_free_space_root( struct btrfs_block_group *block_group) { struct btrfs_key key = { .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = 0, }; if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2)) key.offset = block_group->global_root_id; return btrfs_global_root(block_group->fs_info, &key); } void set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; size_t bitmap_size; u64 num_bitmaps, total_bitmap_size; if (WARN_ON(cache->length == 0)) btrfs_warn(cache->fs_info, "block group %llu length is zero", cache->start); /* * We convert to bitmaps when the disk space required for using extents * exceeds that required for using bitmaps. */ bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range); bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; total_bitmap_size = num_bitmaps * bitmap_size; cache->bitmap_high_thresh = div_u64(total_bitmap_size, sizeof(struct btrfs_item)); /* * We allow for a small buffer between the high threshold and low * threshold to avoid thrashing back and forth between the two formats. */ if (cache->bitmap_high_thresh > 100) cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100; else cache->bitmap_low_thresh = 0; } static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key; struct extent_buffer *leaf; int ret; key.objectid = block_group->start; key.type = BTRFS_FREE_SPACE_INFO_KEY; key.offset = block_group->length; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); if (ret) goto out; leaf = path->nodes[0]; info = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: btrfs_release_path(path); return ret; } EXPORT_FOR_TESTS struct btrfs_free_space_info *search_free_space_info( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, int cow) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; int ret; key.objectid = block_group->start; key.type = BTRFS_FREE_SPACE_INFO_KEY; key.offset = block_group->length; ret = btrfs_search_slot(trans, root, &key, path, 0, cow); if (ret < 0) return ERR_PTR(ret); if (ret != 0) { btrfs_warn(fs_info, "missing free space info for %llu", block_group->start); ASSERT(0); return ERR_PTR(-ENOENT); } return btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_free_space_info); } /* * btrfs_search_slot() but we're looking for the greatest key less than the * passed key. */ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { int ret; ret = btrfs_search_slot(trans, root, key, p, ins_len, cow); if (ret < 0) return ret; if (ret == 0) { ASSERT(0); return -EIO; } if (p->slots[0] == 0) { ASSERT(0); return -EIO; } p->slots[0]--; return 0; } static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info, u64 size) { return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE); } static unsigned long *alloc_bitmap(u32 bitmap_size) { unsigned long *ret; unsigned int nofs_flag; u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long)); /* * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse * into the filesystem as the free space bitmap can be modified in the * critical section of a transaction commit. * * TODO: push the memalloc_nofs_{save,restore}() to the caller where we * know that recursion is unsafe. */ nofs_flag = memalloc_nofs_save(); ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); return ret; } static void le_bitmap_set(unsigned long *map, unsigned int start, int len) { u8 *p = ((u8 *)map) + BIT_BYTE(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_BYTE; mask_to_set = ~0; p++; } if (len) { mask_to_set &= BITMAP_LAST_BYTE_MASK(size); *p |= mask_to_set; } } EXPORT_FOR_TESTS int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; unsigned long *bitmap; char *bitmap_cursor; u64 start, end; u64 bitmap_range, i; u32 bitmap_size, flags, expected_extent_count; u32 extent_count = 0; int done = 0, nr; int ret; bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; goto out; } start = block_group->start; end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; key.offset = (u64)-1; while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) goto out; leaf = path->nodes[0]; nr = 0; path->slots[0]++; while (path->slots[0] > 0) { btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { ASSERT(found_key.objectid == block_group->start); ASSERT(found_key.offset == block_group->length); done = 1; break; } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { u64 first, last; ASSERT(found_key.objectid >= start); ASSERT(found_key.objectid < end); ASSERT(found_key.objectid + found_key.offset <= end); first = div_u64(found_key.objectid - start, fs_info->sectorsize); last = div_u64(found_key.objectid + found_key.offset - start, fs_info->sectorsize); le_bitmap_set(bitmap, first, last - first); extent_count++; nr++; path->slots[0]--; } else { ASSERT(0); } } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); if (ret) goto out; btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; } leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; goto out; } bitmap_cursor = (char *)bitmap; bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { unsigned long ptr; u64 extent_size; u32 data_size; extent_size = min(end - i, bitmap_range); data_size = free_space_bitmap_size(fs_info, extent_size); key.objectid = i; key.type = BTRFS_FREE_SPACE_BITMAP_KEY; key.offset = extent_size; ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); if (ret) goto out; leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; bitmap_cursor += data_size; } ret = 0; out: kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, ret); return ret; } EXPORT_FOR_TESTS int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; unsigned long *bitmap; u64 start, end; u32 bitmap_size, flags, expected_extent_count; unsigned long nrbits, start_bit, end_bit; u32 extent_count = 0; int done = 0, nr; int ret; bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; goto out; } start = block_group->start; end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; key.offset = (u64)-1; while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) goto out; leaf = path->nodes[0]; nr = 0; path->slots[0]++; while (path->slots[0] > 0) { btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { ASSERT(found_key.objectid == block_group->start); ASSERT(found_key.offset == block_group->length); done = 1; break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { unsigned long ptr; char *bitmap_cursor; u32 bitmap_pos, data_size; ASSERT(found_key.objectid >= start); ASSERT(found_key.objectid < end); ASSERT(found_key.objectid + found_key.offset <= end); bitmap_pos = div_u64(found_key.objectid - start, fs_info->sectorsize * BITS_PER_BYTE); bitmap_cursor = ((char *)bitmap) + bitmap_pos; data_size = free_space_bitmap_size(fs_info, found_key.offset); ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); read_extent_buffer(leaf, bitmap_cursor, ptr, data_size); nr++; path->slots[0]--; } else { ASSERT(0); } } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); if (ret) goto out; btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; } leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < nrbits) { end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit); ASSERT(start_bit < end_bit); key.objectid = start + start_bit * block_group->fs_info->sectorsize; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) goto out; btrfs_release_path(path); extent_count++; start_bit = find_next_bit_le(bitmap, nrbits, end_bit); } if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; goto out; } ret = 0; out: kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, ret); return ret; } static int update_free_space_extent_count(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, int new_extents) { struct btrfs_free_space_info *info; u32 flags; u32 extent_count; int ret = 0; if (new_extents == 0) return 0; info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; } flags = btrfs_free_space_flags(path->nodes[0], info); extent_count = btrfs_free_space_extent_count(path->nodes[0], info); extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count > block_group->bitmap_high_thresh) { ret = convert_free_space_to_bitmaps(trans, block_group, path); } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count < block_group->bitmap_low_thresh) { ret = convert_free_space_to_extents(trans, block_group, path); } out: return ret; } EXPORT_FOR_TESTS int free_space_test_bit(struct btrfs_block_group *block_group, struct btrfs_path *path, u64 offset) { struct extent_buffer *leaf; struct btrfs_key key; u64 found_start, found_end; unsigned long ptr, i; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); found_start = key.objectid; found_end = key.objectid + key.offset; ASSERT(offset >= found_start && offset < found_end); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); i = div_u64(offset - found_start, block_group->fs_info->sectorsize); return !!extent_buffer_test_bit(leaf, ptr, i); } static void free_space_set_bits(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 *start, u64 *size, int bit) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_buffer *leaf; struct btrfs_key key; u64 end = *start + *size; u64 found_start, found_end; unsigned long ptr, first, last; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); found_start = key.objectid; found_end = key.objectid + key.offset; ASSERT(*start >= found_start && *start < found_end); ASSERT(end > found_start); if (end > found_end) end = found_end; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); first = (*start - found_start) >> fs_info->sectorsize_bits; last = (end - found_start) >> fs_info->sectorsize_bits; if (bit) extent_buffer_bitmap_set(leaf, ptr, first, last - first); else extent_buffer_bitmap_clear(leaf, ptr, first, last - first); btrfs_mark_buffer_dirty(trans, leaf); *size -= end - *start; *start = end; } /* * We can't use btrfs_next_item() in modify_free_space_bitmap() because * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy * tree walking in btrfs_next_leaf() anyways because we know exactly what we're * looking for. */ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p) { struct btrfs_key key; if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) { p->slots[0]++; return 0; } btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]); btrfs_release_path(p); key.objectid += key.offset; key.type = (u8)-1; key.offset = (u64)-1; return btrfs_search_prev_slot(trans, root, &key, p, 0, 1); } /* * If remove is 1, then we are removing free space, thus clearing bits in the * bitmap. If remove is 0, then we are adding free space, thus setting bits in * the bitmap. */ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size, int remove) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; int prev_bit, next_bit; int new_extents; int ret; /* * Read the bit for the block immediately before the extent of space if * that block is within the block group. */ if (start > block_group->start) { u64 prev_block = start - block_group->fs_info->sectorsize; key.objectid = prev_block; key.type = (u8)-1; key.offset = (u64)-1; ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) goto out; prev_bit = free_space_test_bit(block_group, path, prev_block); /* The previous block may have been in the previous bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (start >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) goto out; } } else { key.objectid = start; key.type = (u8)-1; key.offset = (u64)-1; ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) goto out; prev_bit = -1; } /* * Iterate over all of the bitmaps overlapped by the extent of space, * clearing/setting bits as required. */ cur_start = start; cur_size = size; while (1) { free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, !remove); if (cur_size == 0) break; ret = free_space_next_bitmap(trans, root, path); if (ret) goto out; } /* * Read the bit for the block immediately after the extent of space if * that block is within the block group. */ if (end < block_group->start + block_group->length) { /* The next block may be in the next bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (end >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) goto out; } next_bit = free_space_test_bit(block_group, path, end); } else { next_bit = -1; } if (remove) { new_extents = -1; if (prev_bit == 1) { /* Leftover on the left. */ new_extents++; } if (next_bit == 1) { /* Leftover on the right. */ new_extents++; } } else { new_extents = 1; if (prev_bit == 1) { /* Merging with neighbor on the left. */ new_extents--; } if (next_bit == 1) { /* Merging with neighbor on the right. */ new_extents--; } } btrfs_release_path(path); ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: return ret; } static int remove_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 found_start, found_end; u64 end = start + size; int new_extents = -1; int ret; key.objectid = start; key.type = (u8)-1; key.offset = (u64)-1; ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) goto out; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); found_start = key.objectid; found_end = key.objectid + key.offset; ASSERT(start >= found_start && end <= found_end); /* * Okay, now that we've found the free space extent which contains the * free space that we are removing, there are four cases: * * 1. We're using the whole extent: delete the key we found and * decrement the free space extent count. * 2. We are using part of the extent starting at the beginning: delete * the key we found and insert a new key representing the leftover at * the end. There is no net change in the number of extents. * 3. We are using part of the extent ending at the end: delete the key * we found and insert a new key representing the leftover at the * beginning. There is no net change in the number of extents. * 4. We are using part of the extent in the middle: delete the key we * found and insert two new keys representing the leftovers on each * side. Where we used to have one extent, we now have two, so increment * the extent count. We may need to convert the block group to bitmaps * as a result. */ /* Delete the existing key (cases 1-4). */ ret = btrfs_del_item(trans, root, path); if (ret) goto out; /* Add a key for leftovers at the beginning (cases 3 and 4). */ if (start > found_start) { key.objectid = found_start; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; key.offset = start - found_start; btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) goto out; new_extents++; } /* Add a key for leftovers at the end (cases 2 and 4). */ if (end < found_end) { key.objectid = end; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; key.offset = found_end - end; btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) goto out; new_extents++; } btrfs_release_path(path); ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: return ret; } EXPORT_FOR_TESTS int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { struct btrfs_free_space_info *info; u32 flags; int ret; if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; } info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); btrfs_release_path(path); if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { return modify_free_space_bitmap(trans, block_group, path, start, size, 1); } else { return remove_free_space_extent(trans, block_group, path, start, size); } } int remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { ASSERT(0); ret = -ENOENT; goto out; } mutex_lock(&block_group->free_space_lock); ret = __remove_from_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); btrfs_put_block_group(block_group); out: btrfs_free_path(path); if (ret) btrfs_abort_transaction(trans, ret); return ret; } static int add_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key, new_key; u64 found_start, found_end; u64 end = start + size; int new_extents = 1; int ret; /* * We are adding a new extent of free space, but we need to merge * extents. There are four cases here: * * 1. The new extent does not have any immediate neighbors to merge * with: add the new key and increment the free space extent count. We * may need to convert the block group to bitmaps as a result. * 2. The new extent has an immediate neighbor before it: remove the * previous key and insert a new key combining both of them. There is no * net change in the number of extents. * 3. The new extent has an immediate neighbor after it: remove the next * key and insert a new key combining both of them. There is no net * change in the number of extents. * 4. The new extent has immediate neighbors on both sides: remove both * of the keys and insert a new key combining all of them. Where we used * to have two extents, we now have one, so decrement the extent count. */ new_key.objectid = start; new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY; new_key.offset = size; /* Search for a neighbor on the left. */ if (start == block_group->start) goto right; key.objectid = start - 1; key.type = (u8)-1; key.offset = (u64)-1; ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) goto out; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); btrfs_release_path(path); goto right; } found_start = key.objectid; found_end = key.objectid + key.offset; ASSERT(found_start >= block_group->start && found_end > block_group->start); ASSERT(found_start < start && found_end <= start); /* * Delete the neighbor on the left and absorb it into the new key (cases * 2 and 4). */ if (found_end == start) { ret = btrfs_del_item(trans, root, path); if (ret) goto out; new_key.objectid = found_start; new_key.offset += key.offset; new_extents--; } btrfs_release_path(path); right: /* Search for a neighbor on the right. */ if (end == block_group->start + block_group->length) goto insert; key.objectid = end; key.type = (u8)-1; key.offset = (u64)-1; ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) goto out; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); btrfs_release_path(path); goto insert; } found_start = key.objectid; found_end = key.objectid + key.offset; ASSERT(found_start >= block_group->start && found_end > block_group->start); ASSERT((found_start < start && found_end <= start) || (found_start >= end && found_end > end)); /* * Delete the neighbor on the right and absorb it into the new key * (cases 3 and 4). */ if (found_start == end) { ret = btrfs_del_item(trans, root, path); if (ret) goto out; new_key.offset += key.offset; new_extents--; } btrfs_release_path(path); insert: /* Insert the new key (cases 1-4). */ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); if (ret) goto out; btrfs_release_path(path); ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: return ret; } EXPORT_FOR_TESTS int __add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { struct btrfs_free_space_info *info; u32 flags; int ret; if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; } info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); btrfs_release_path(path); if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { return modify_free_space_bitmap(trans, block_group, path, start, size, 0); } else { return add_free_space_extent(trans, block_group, path, start, size); } } int add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { ASSERT(0); ret = -ENOENT; goto out; } mutex_lock(&block_group->free_space_lock); ret = __add_to_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); btrfs_put_block_group(block_group); out: btrfs_free_path(path); if (ret) btrfs_abort_transaction(trans, ret); return ret; } /* * Populate the free space tree by walking the extent tree. Operations on the * extent tree that happen as a result of writes to the free space tree will go * through the normal add/remove hooks. */ static int populate_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; struct btrfs_path *path, *path2; struct btrfs_key key; u64 start, end; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); if (!path2) { btrfs_free_path(path); return -ENOMEM; } ret = add_new_free_space_info(trans, block_group, path2); if (ret) goto out; mutex_lock(&block_group->free_space_lock); /* * Iterate through all of the extent and metadata items in this block * group, adding the free space between them and the free space at the * end. Note that EXTENT_ITEM and METADATA_ITEM are less than * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's * contained in. */ key.objectid = block_group->start; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; extent_root = btrfs_extent_root(trans->fs_info, key.objectid); ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; while (1) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY) { if (key.objectid >= end) break; if (start < key.objectid) { ret = __add_to_free_space_tree(trans, block_group, path2, start, key.objectid - start); if (ret) goto out_locked; } start = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) start += trans->fs_info->nodesize; else start += key.offset; } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { if (key.objectid != block_group->start) break; } ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; if (ret) break; } if (start < end) { ret = __add_to_free_space_tree(trans, block_group, path2, start, end - start); if (ret) goto out_locked; } ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); out: btrfs_free_path(path2); btrfs_free_path(path); return ret; } int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *free_space_root; struct btrfs_block_group *block_group; struct rb_node *node; int ret; trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); free_space_root = btrfs_create_tree(trans, BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out_clear; } ret = btrfs_global_root_insert(free_space_root); if (ret) { btrfs_put_root(free_space_root); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out_clear; } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out_clear; } node = rb_next(node); } btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); ret = btrfs_commit_transaction(trans); /* * Now that we've committed the transaction any reading of our commit * root will be safe, so we can cache from the free space tree now. */ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; out_clear: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; } static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_path *path; struct btrfs_key key; int nr; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = 0; key.type = 0; key.offset = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) break; path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) goto out; btrfs_release_path(path); } ret = 0; out: btrfs_free_path(path); return ret; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_key key = { .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = 0, }; struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); int ret; trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_del_root(trans, &free_space_root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } btrfs_global_root_delete(free_space_root); spin_lock(&fs_info->trans_lock); list_del(&free_space_root->dirty_list); spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(free_space_root->node); btrfs_clear_buffer_dirty(trans, free_space_root->node); btrfs_tree_unlock(free_space_root->node); ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); btrfs_put_root(free_space_root); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } return btrfs_commit_transaction(trans); } int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_key key = { .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = 0, }; struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); struct rb_node *node; int ret; trans = btrfs_start_transaction(free_space_root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { struct btrfs_block_group *block_group; block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } node = rb_next(node); } btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); ret = btrfs_commit_transaction(trans); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); ret = add_new_free_space_info(trans, block_group, path); if (ret) return ret; return __add_to_free_space_tree(trans, block_group, path, block_group->start, block_group->length); } int add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path = NULL; int ret = 0; if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) return 0; mutex_lock(&block_group->free_space_lock); if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) goto out; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = __add_block_group_free_space(trans, block_group, path); out: btrfs_free_path(path); mutex_unlock(&block_group->free_space_lock); if (ret) btrfs_abort_transaction(trans, ret); return ret; } int remove_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_path *path; struct btrfs_key key, found_key; struct extent_buffer *leaf; u64 start, end; int done = 0, nr; int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { /* We never added this block group to the free space tree. */ return 0; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } start = block_group->start; end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; key.offset = (u64)-1; while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) goto out; leaf = path->nodes[0]; nr = 0; path->slots[0]++; while (path->slots[0] > 0) { btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { ASSERT(found_key.objectid == block_group->start); ASSERT(found_key.offset == block_group->length); done = 1; nr++; path->slots[0]--; break; } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY || found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { ASSERT(found_key.objectid >= start); ASSERT(found_key.objectid < end); ASSERT(found_key.objectid + found_key.offset <= end); nr++; path->slots[0]--; } else { ASSERT(0); } } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); if (ret) goto out; btrfs_release_path(path); } ret = 0; out: btrfs_free_path(path); if (ret) btrfs_abort_transaction(trans, ret); return ret; } static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; int prev_bit = 0, bit; /* Initialize to silence GCC. */ u64 extent_start = 0; u64 end, offset; u64 total_found = 0; u32 extent_count = 0; int ret; block_group = caching_ctl->block_group; fs_info = block_group->fs_info; root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; while (1) { ret = btrfs_next_item(root, path); if (ret < 0) goto out; if (ret) break; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_FREE_SPACE_INFO_KEY) break; ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); offset = key.objectid; while (offset < key.objectid + key.offset) { bit = free_space_test_bit(block_group, path, offset); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { u64 space_added; ret = btrfs_add_new_free_space(block_group, extent_start, offset, &space_added); if (ret) goto out; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; wake_up(&caching_ctl->wait); } extent_count++; } prev_bit = bit; offset += fs_info->sectorsize; } } if (prev_bit == 1) { ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) goto out; extent_count++; } if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; goto out; } ret = 0; out: return ret; } static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; u64 end; u64 total_found = 0; u32 extent_count = 0; int ret; block_group = caching_ctl->block_group; fs_info = block_group->fs_info; root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; while (1) { u64 space_added; ret = btrfs_next_item(root, path); if (ret < 0) goto out; if (ret) break; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_FREE_SPACE_INFO_KEY) break; ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); ret = btrfs_add_new_free_space(block_group, key.objectid, key.objectid + key.offset, &space_added); if (ret) goto out; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; wake_up(&caching_ctl->wait); } extent_count++; } if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; goto out; } ret = 0; out: return ret; } int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; struct btrfs_path *path; u32 extent_count, flags; int ret; block_group = caching_ctl->block_group; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * Just like caching_thread() doesn't want to deadlock on the extent * tree, we don't want to deadlock on the free space tree. */ path->skip_locking = 1; path->search_commit_root = 1; path->reada = READA_FORWARD; info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; } extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); /* * We left path pointing to the free space info item, so now * load_free_space_foo can just iterate through the free space tree from * there. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) ret = load_free_space_bitmaps(caching_ctl, path, extent_count); else ret = load_free_space_extents(caching_ctl, path, extent_count); out: btrfs_free_path(path); return ret; } |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (c) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * * Error mapping routines from Samba libsmb/errormap.c * Copyright (C) Andrew Tridgell 2001 */ #include <linux/net.h> #include <linux/string.h> #include <linux/in.h> #include <linux/ctype.h> #include <linux/fs.h> #include <asm/div64.h> #include <asm/byteorder.h> #include <linux/inet.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smberr.h" #include "cifs_debug.h" #include "nterr.h" struct smb_to_posix_error { __u16 smb_err; int posix_code; }; static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRbadfunc, -EINVAL}, {ERRbadfile, -ENOENT}, {ERRbadpath, -ENOTDIR}, {ERRnofids, -EMFILE}, {ERRnoaccess, -EACCES}, {ERRbadfid, -EBADF}, {ERRbadmcb, -EIO}, {ERRnomem, -EREMOTEIO}, {ERRbadmem, -EFAULT}, {ERRbadenv, -EFAULT}, {ERRbadformat, -EINVAL}, {ERRbadaccess, -EACCES}, {ERRbaddata, -EIO}, {ERRbaddrive, -ENXIO}, {ERRremcd, -EACCES}, {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, {ERRfilexists, -EEXIST}, {ERRinvparm, -EINVAL}, {ERRdiskfull, -ENOSPC}, {ERRinvname, -ENOENT}, {ERRinvlevel, -EOPNOTSUPP}, {ERRdirnotempty, -ENOTEMPTY}, {ERRnotlocked, -ENOLCK}, {ERRcancelviolation, -ENOLCK}, {ERRalreadyexists, -EEXIST}, {ERRmoredata, -EOVERFLOW}, {ERReasnotsupported, -EOPNOTSUPP}, {ErrQuota, -EDQUOT}, {ErrNotALink, -ENOLINK}, {ERRnetlogonNotStarted, -ENOPROTOOPT}, {ERRsymlink, -EOPNOTSUPP}, {ErrTooManyLinks, -EMLINK}, {0, 0} }; static const struct smb_to_posix_error mapping_table_ERRSRV[] = { {ERRerror, -EIO}, {ERRbadpw, -EACCES}, /* was EPERM */ {ERRbadtype, -EREMOTE}, {ERRaccess, -EACCES}, {ERRinvtid, -ENXIO}, {ERRinvnetname, -ENXIO}, {ERRinvdevice, -ENXIO}, {ERRqfull, -ENOSPC}, {ERRqtoobig, -ENOSPC}, {ERRqeof, -EIO}, {ERRinvpfid, -EBADF}, {ERRsmbcmd, -EBADRQC}, {ERRsrverror, -EIO}, {ERRbadBID, -EIO}, {ERRfilespecs, -EINVAL}, {ERRbadLink, -EIO}, {ERRbadpermits, -EINVAL}, {ERRbadPID, -ESRCH}, {ERRsetattrmode, -EINVAL}, {ERRpaused, -EHOSTDOWN}, {ERRmsgoff, -EHOSTDOWN}, {ERRnoroom, -ENOSPC}, {ERRrmuns, -EUSERS}, {ERRtimeout, -ETIME}, {ERRnoresource, -EREMOTEIO}, {ERRtoomanyuids, -EUSERS}, {ERRbaduid, -EACCES}, {ERRusempx, -EIO}, {ERRusestd, -EIO}, {ERR_NOTIFY_ENUM_DIR, -ENOBUFS}, {ERRnoSuchUser, -EACCES}, /* {ERRaccountexpired, -EACCES}, {ERRbadclient, -EACCES}, {ERRbadLogonTime, -EACCES}, {ERRpasswordExpired, -EACCES},*/ {ERRaccountexpired, -EKEYEXPIRED}, {ERRbadclient, -EACCES}, {ERRbadLogonTime, -EACCES}, {ERRpasswordExpired, -EKEYEXPIRED}, {ERRnosupport, -EINVAL}, {0, 0} }; /* * Convert a string containing text IPv4 or IPv6 address to binary form. * * Returns 0 on failure. */ static int cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) { int ret = 0; /* calculate length by finding first slash or NULL */ if (address_family == AF_INET) ret = in4_pton(cp, len, dst, '\\', NULL); else if (address_family == AF_INET6) ret = in6_pton(cp, len, dst , '\\', NULL); cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n", ret, len, len, cp); if (ret > 0) ret = 1; return ret; } /* * Try to convert a string to an IPv4 address and then attempt to convert * it to an IPv6 address if that fails. Set the family field if either * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to * treat the part following it as a numeric sin6_scope_id. * * Returns 0 on failure. */ int cifs_convert_address(struct sockaddr *dst, const char *src, int len) { int rc, alen, slen; const char *pct; char scope_id[13]; struct sockaddr_in *s4 = (struct sockaddr_in *) dst; struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; /* IPv4 address */ if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { s4->sin_family = AF_INET; return 1; } /* attempt to exclude the scope ID from the address part */ pct = memchr(src, '%', len); alen = pct ? pct - src : len; rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); if (!rc) return rc; s6->sin6_family = AF_INET6; if (pct) { /* grab the scope ID */ slen = len - (alen + 1); if (slen <= 0 || slen > 12) return 0; memcpy(scope_id, pct + 1, slen); scope_id[slen] = '\0'; rc = kstrtouint(scope_id, 0, &s6->sin6_scope_id); rc = (rc == 0) ? 1 : 0; } return rc; } void cifs_set_port(struct sockaddr *addr, const unsigned short int port) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *)addr)->sin_port = htons(port); break; case AF_INET6: ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); break; } } /***************************************************************************** convert a NT status code to a dos class/code *****************************************************************************/ /* NT status -> dos error map */ static const struct { __u8 dos_class; __u16 dos_code; __u32 ntstatus; } ntstatus_to_dos_map[] = { { ERRDOS, ERRgeneral, NT_STATUS_UNSUCCESSFUL}, { ERRDOS, ERRbadfunc, NT_STATUS_NOT_IMPLEMENTED}, { ERRDOS, ERRinvlevel, NT_STATUS_INVALID_INFO_CLASS}, { ERRDOS, 24, NT_STATUS_INFO_LENGTH_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_ACCESS_VIOLATION}, { ERRHRD, ERRgeneral, NT_STATUS_IN_PAGE_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA}, { ERRDOS, ERRbadfid, NT_STATUS_INVALID_HANDLE}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_INITIAL_STACK}, { ERRDOS, 193, NT_STATUS_BAD_INITIAL_PC}, { ERRDOS, 87, NT_STATUS_INVALID_CID}, { ERRHRD, ERRgeneral, NT_STATUS_TIMER_NOT_CANCELED}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER}, { ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_DEVICE}, { ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_FILE}, { ERRDOS, ERRbadfunc, NT_STATUS_INVALID_DEVICE_REQUEST}, { ERRDOS, 38, NT_STATUS_END_OF_FILE}, { ERRDOS, 34, NT_STATUS_WRONG_VOLUME}, { ERRDOS, 21, NT_STATUS_NO_MEDIA_IN_DEVICE}, { ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_MEDIA}, { ERRDOS, 27, NT_STATUS_NONEXISTENT_SECTOR}, /* { This NT error code was 'sqashed' from NT_STATUS_MORE_PROCESSING_REQUIRED to NT_STATUS_OK during the session setup } */ { ERRDOS, ERRnomem, NT_STATUS_NO_MEMORY}, { ERRDOS, 487, NT_STATUS_CONFLICTING_ADDRESSES}, { ERRDOS, 487, NT_STATUS_NOT_MAPPED_VIEW}, { ERRDOS, 87, NT_STATUS_UNABLE_TO_FREE_VM}, { ERRDOS, 87, NT_STATUS_UNABLE_TO_DELETE_SECTION}, { ERRDOS, 2142, NT_STATUS_INVALID_SYSTEM_SERVICE}, { ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_INSTRUCTION}, { ERRDOS, ERRnoaccess, NT_STATUS_INVALID_LOCK_SEQUENCE}, { ERRDOS, ERRnoaccess, NT_STATUS_INVALID_VIEW_SIZE}, { ERRDOS, 193, NT_STATUS_INVALID_FILE_FOR_SECTION}, { ERRDOS, ERRnoaccess, NT_STATUS_ALREADY_COMMITTED}, /* { This NT error code was 'sqashed' from NT_STATUS_ACCESS_DENIED to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_ACCESS_DENIED}, { ERRDOS, 111, NT_STATUS_BUFFER_TOO_SMALL}, { ERRDOS, ERRbadfid, NT_STATUS_OBJECT_TYPE_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_NONCONTINUABLE_EXCEPTION}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_DISPOSITION}, { ERRHRD, ERRgeneral, NT_STATUS_UNWIND}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_STACK}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_UNWIND_TARGET}, { ERRDOS, 158, NT_STATUS_NOT_LOCKED}, { ERRHRD, ERRgeneral, NT_STATUS_PARITY_ERROR}, { ERRDOS, 487, NT_STATUS_UNABLE_TO_DECOMMIT_VM}, { ERRDOS, 487, NT_STATUS_NOT_COMMITTED}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_PORT_ATTRIBUTES}, { ERRHRD, ERRgeneral, NT_STATUS_PORT_MESSAGE_TOO_LONG}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_MIX}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_QUOTA_LOWER}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_CORRUPT_ERROR}, { /* mapping changed since shell does lookup on * expects FileNotFound */ ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, { ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_NOT_FOUND}, { ERRDOS, ERRalreadyexists, NT_STATUS_OBJECT_NAME_COLLISION}, { ERRHRD, ERRgeneral, NT_STATUS_HANDLE_NOT_WAITABLE}, { ERRDOS, ERRbadfid, NT_STATUS_PORT_DISCONNECTED}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_ALREADY_ATTACHED}, { ERRDOS, 161, NT_STATUS_OBJECT_PATH_INVALID}, { ERRDOS, ERRbadpath, NT_STATUS_OBJECT_PATH_NOT_FOUND}, { ERRDOS, 161, NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, { ERRHRD, ERRgeneral, NT_STATUS_DATA_OVERRUN}, { ERRHRD, ERRgeneral, NT_STATUS_DATA_LATE_ERROR}, { ERRDOS, 23, NT_STATUS_DATA_ERROR}, { ERRDOS, 23, NT_STATUS_CRC_ERROR}, { ERRDOS, ERRnomem, NT_STATUS_SECTION_TOO_BIG}, { ERRDOS, ERRnoaccess, NT_STATUS_PORT_CONNECTION_REFUSED}, { ERRDOS, ERRbadfid, NT_STATUS_INVALID_PORT_HANDLE}, { ERRDOS, ERRbadshare, NT_STATUS_SHARING_VIOLATION}, { ERRHRD, ERRgeneral, NT_STATUS_QUOTA_EXCEEDED}, { ERRDOS, 87, NT_STATUS_INVALID_PAGE_PROTECTION}, { ERRDOS, 288, NT_STATUS_MUTANT_NOT_OWNED}, { ERRDOS, 298, NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, { ERRDOS, 87, NT_STATUS_PORT_ALREADY_SET}, { ERRDOS, 87, NT_STATUS_SECTION_NOT_IMAGE}, { ERRDOS, 156, NT_STATUS_SUSPEND_COUNT_EXCEEDED}, { ERRDOS, ERRnoaccess, NT_STATUS_THREAD_IS_TERMINATING}, { ERRDOS, 87, NT_STATUS_BAD_WORKING_SET_LIMIT}, { ERRDOS, 87, NT_STATUS_INCOMPATIBLE_FILE_MAP}, { ERRDOS, 87, NT_STATUS_SECTION_PROTECTION}, { ERRDOS, ERReasnotsupported, NT_STATUS_EAS_NOT_SUPPORTED}, { ERRDOS, 255, NT_STATUS_EA_TOO_LARGE}, { ERRHRD, ERRgeneral, NT_STATUS_NONEXISTENT_EA_ENTRY}, { ERRHRD, ERRgeneral, NT_STATUS_NO_EAS_ON_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_EA_CORRUPT_ERROR}, { ERRDOS, ERRlock, NT_STATUS_FILE_LOCK_CONFLICT}, { ERRDOS, ERRlock, NT_STATUS_LOCK_NOT_GRANTED}, { ERRDOS, ERRbadfile, NT_STATUS_DELETE_PENDING}, { ERRDOS, ERRunsup, NT_STATUS_CTL_FILE_NOT_SUPPORTED}, { ERRHRD, ERRgeneral, NT_STATUS_UNKNOWN_REVISION}, { ERRHRD, ERRgeneral, NT_STATUS_REVISION_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_OWNER}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_PRIMARY_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_NO_IMPERSONATION_TOKEN}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_DISABLE_MANDATORY}, { ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, { ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, { ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS}, /* { This NT error code was 'sqashed' from NT_STATUS_NO_SUCH_USER to NT_STATUS_LOGON_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_NO_SUCH_USER}, { /* could map to 2238 */ ERRHRD, ERRgeneral, NT_STATUS_GROUP_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_LAST_ADMIN}, /* { This NT error code was 'sqashed' from NT_STATUS_WRONG_PASSWORD to NT_STATUS_LOGON_FAILURE during the session setup } */ { ERRSRV, ERRbadpw, NT_STATUS_WRONG_PASSWORD}, { ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_PASSWORD}, { ERRHRD, ERRgeneral, NT_STATUS_PASSWORD_RESTRICTION}, { ERRDOS, ERRnoaccess, NT_STATUS_LOGON_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { ERRSRV, ERRbadLogonTime, NT_STATUS_INVALID_LOGON_HOURS}, { ERRSRV, ERRbadclient, NT_STATUS_INVALID_WORKSTATION}, { ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, { ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_DISABLED}, { ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { ERRHRD, ERRgeneral, NT_STATUS_LUIDS_EXHAUSTED}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SUB_AUTHORITY}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACL}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SID}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SECURITY_DESCR}, { ERRDOS, 127, NT_STATUS_PROCEDURE_NOT_FOUND}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_FORMAT}, { ERRHRD, ERRgeneral, NT_STATUS_NO_TOKEN}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_INHERITANCE_ACL}, { ERRDOS, 158, NT_STATUS_RANGE_NOT_LOCKED}, { ERRDOS, 112, NT_STATUS_DISK_FULL}, { ERRHRD, ERRgeneral, NT_STATUS_SERVER_DISABLED}, { ERRHRD, ERRgeneral, NT_STATUS_SERVER_NOT_DISABLED}, { ERRDOS, 68, NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, { ERRDOS, 259, NT_STATUS_GUIDS_EXHAUSTED}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ID_AUTHORITY}, { ERRDOS, 259, NT_STATUS_AGENTS_EXHAUSTED}, { ERRDOS, 154, NT_STATUS_INVALID_VOLUME_LABEL}, { ERRDOS, 14, NT_STATUS_SECTION_NOT_EXTENDED}, { ERRDOS, 487, NT_STATUS_NOT_MAPPED_DATA}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_DATA_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_NAME_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DENORMAL_OPERAND}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INEXACT_RESULT}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INVALID_OPERATION}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_STACK_CHECK}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_UNDERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, { ERRDOS, 534, NT_STATUS_INTEGER_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_PRIVILEGED_INSTRUCTION}, { ERRDOS, ERRnomem, NT_STATUS_TOO_MANY_PAGING_FILES}, { ERRHRD, ERRgeneral, NT_STATUS_FILE_INVALID}, { ERRHRD, ERRgeneral, NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, /* { This NT error code was 'sqashed' from NT_STATUS_INSUFFICIENT_RESOURCES to NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */ { ERRDOS, ERRnoresource, NT_STATUS_INSUFFICIENT_RESOURCES}, { ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, { ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, { ERRDOS, 21, NT_STATUS_DEVICE_POWER_FAILURE}, { ERRDOS, 487, NT_STATUS_FREE_VM_NOT_AT_BASE}, { ERRDOS, 487, NT_STATUS_MEMORY_NOT_ALLOCATED}, { ERRHRD, ERRgeneral, NT_STATUS_WORKING_SET_QUOTA}, { ERRDOS, 19, NT_STATUS_MEDIA_WRITE_PROTECTED}, { ERRDOS, 21, NT_STATUS_DEVICE_NOT_READY}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_GROUP_ATTRIBUTES}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_IMPERSONATION_LEVEL}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_OPEN_ANONYMOUS}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_VALIDATION_CLASS}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_TOKEN_TYPE}, { ERRDOS, 87, NT_STATUS_BAD_MASTER_BOOT_RECORD}, { ERRHRD, ERRgeneral, NT_STATUS_INSTRUCTION_MISALIGNMENT}, { ERRDOS, ERRpipebusy, NT_STATUS_INSTANCE_NOT_AVAILABLE}, { ERRDOS, ERRpipebusy, NT_STATUS_PIPE_NOT_AVAILABLE}, { ERRDOS, ERRbadpipe, NT_STATUS_INVALID_PIPE_STATE}, { ERRDOS, ERRpipebusy, NT_STATUS_PIPE_BUSY}, { ERRDOS, ERRbadfunc, NT_STATUS_ILLEGAL_FUNCTION}, { ERRDOS, ERRnotconnected, NT_STATUS_PIPE_DISCONNECTED}, { ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_CLOSING}, { ERRHRD, ERRgeneral, NT_STATUS_PIPE_CONNECTED}, { ERRHRD, ERRgeneral, NT_STATUS_PIPE_LISTENING}, { ERRDOS, ERRbadpipe, NT_STATUS_INVALID_READ_MODE}, { ERRDOS, 121, NT_STATUS_IO_TIMEOUT}, { ERRDOS, 38, NT_STATUS_FILE_FORCED_CLOSED}, { ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STARTED}, { ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STOPPED}, { ERRHRD, ERRgeneral, NT_STATUS_COULD_NOT_INTERPRET}, { ERRDOS, ERRnoaccess, NT_STATUS_FILE_IS_A_DIRECTORY}, { ERRDOS, ERRunsup, NT_STATUS_NOT_SUPPORTED}, { ERRDOS, 51, NT_STATUS_REMOTE_NOT_LISTENING}, { ERRDOS, 52, NT_STATUS_DUPLICATE_NAME}, { ERRDOS, 53, NT_STATUS_BAD_NETWORK_PATH}, { ERRDOS, 54, NT_STATUS_NETWORK_BUSY}, { ERRDOS, 55, NT_STATUS_DEVICE_DOES_NOT_EXIST}, { ERRDOS, 56, NT_STATUS_TOO_MANY_COMMANDS}, { ERRDOS, 57, NT_STATUS_ADAPTER_HARDWARE_ERROR}, { ERRDOS, 58, NT_STATUS_INVALID_NETWORK_RESPONSE}, { ERRDOS, 59, NT_STATUS_UNEXPECTED_NETWORK_ERROR}, { ERRDOS, 60, NT_STATUS_BAD_REMOTE_ADAPTER}, { ERRDOS, 61, NT_STATUS_PRINT_QUEUE_FULL}, { ERRDOS, 62, NT_STATUS_NO_SPOOL_SPACE}, { ERRDOS, 63, NT_STATUS_PRINT_CANCELLED}, { ERRDOS, 64, NT_STATUS_NETWORK_NAME_DELETED}, { ERRDOS, 65, NT_STATUS_NETWORK_ACCESS_DENIED}, { ERRDOS, 66, NT_STATUS_BAD_DEVICE_TYPE}, { ERRDOS, ERRnosuchshare, NT_STATUS_BAD_NETWORK_NAME}, { ERRDOS, 68, NT_STATUS_TOO_MANY_NAMES}, { ERRDOS, 69, NT_STATUS_TOO_MANY_SESSIONS}, { ERRDOS, 70, NT_STATUS_SHARING_PAUSED}, { ERRDOS, 71, NT_STATUS_REQUEST_NOT_ACCEPTED}, { ERRDOS, 72, NT_STATUS_REDIRECTOR_PAUSED}, { ERRDOS, 88, NT_STATUS_NET_WRITE_FAULT}, { ERRHRD, ERRgeneral, NT_STATUS_PROFILING_AT_LIMIT}, { ERRDOS, ERRdiffdevice, NT_STATUS_NOT_SAME_DEVICE}, { ERRDOS, ERRnoaccess, NT_STATUS_FILE_RENAMED}, { ERRDOS, 240, NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SECURITY_ON_OBJECT}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_WAIT}, { ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_EMPTY}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_TERMINATE_SELF}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SERVER_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_ROLE}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_DOMAIN}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, { ERRDOS, 300, NT_STATUS_OPLOCK_NOT_GRANTED}, { ERRDOS, 301, NT_STATUS_INVALID_OPLOCK_PROTOCOL}, { ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_CORRUPTION}, { ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_GENERIC_NOT_MAPPED}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_DESCRIPTOR_FORMAT}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_USER_BUFFER}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_IO_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_LOGON_PROCESS}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_EXISTS}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_1}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_2}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_3}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_4}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_5}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_6}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_7}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_8}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_9}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_10}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_11}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_12}, { ERRDOS, ERRbadpath, NT_STATUS_REDIRECTOR_NOT_STARTED}, { ERRHRD, ERRgeneral, NT_STATUS_REDIRECTOR_STARTED}, { ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PACKAGE}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_FUNCTION_TABLE}, { ERRDOS, 203, 0xc0000100}, { ERRDOS, 145, NT_STATUS_DIRECTORY_NOT_EMPTY}, { ERRHRD, ERRgeneral, NT_STATUS_FILE_CORRUPT_ERROR}, { ERRDOS, 267, NT_STATUS_NOT_A_DIRECTORY}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_LOGON_SESSION_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_COLLISION}, { ERRDOS, 206, NT_STATUS_NAME_TOO_LONG}, { ERRDOS, 2401, NT_STATUS_FILES_OPEN}, { ERRDOS, 2404, NT_STATUS_CONNECTION_IN_USE}, { ERRHRD, ERRgeneral, NT_STATUS_MESSAGE_NOT_FOUND}, { ERRDOS, ERRnoaccess, NT_STATUS_PROCESS_IS_TERMINATING}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LOGON_TYPE}, { ERRHRD, ERRgeneral, NT_STATUS_NO_GUID_TRANSLATION}, { ERRHRD, ERRgeneral, NT_STATUS_CANNOT_IMPERSONATE}, { ERRHRD, ERRgeneral, NT_STATUS_IMAGE_ALREADY_LOADED}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_PRESENT}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_NOT_EXIST}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_ALREADY_OWNED}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_LID_OWNER}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_COMMAND}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_LID}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_SELECTOR}, { ERRHRD, ERRgeneral, NT_STATUS_NO_LDT}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_SIZE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_OFFSET}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_DESCRIPTOR}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NE_FORMAT}, { ERRHRD, ERRgeneral, NT_STATUS_RXACT_INVALID_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_RXACT_COMMIT_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_MAPPED_FILE_SIZE_ZERO}, { ERRDOS, ERRnofids, NT_STATUS_TOO_MANY_OPENED_FILES}, { ERRHRD, ERRgeneral, NT_STATUS_CANCELLED}, { ERRDOS, ERRnoaccess, NT_STATUS_CANNOT_DELETE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_COMPUTER_NAME}, { ERRDOS, ERRnoaccess, NT_STATUS_FILE_DELETED}, { ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_ACCOUNT}, { ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_USER}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBERS_PRIMARY_GROUP}, { ERRDOS, ERRbadfid, NT_STATUS_FILE_CLOSED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_THREADS}, { ERRHRD, ERRgeneral, NT_STATUS_THREAD_NOT_IN_PROCESS}, { ERRHRD, ERRgeneral, NT_STATUS_TOKEN_ALREADY_IN_USE}, { ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, { ERRHRD, ERRgeneral, NT_STATUS_COMMITMENT_LIMIT}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_LE_FORMAT}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NOT_MZ}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_PROTECT}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_WIN_16}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_SERVER_CONFLICT}, { ERRHRD, ERRgeneral, NT_STATUS_TIME_DIFFERENCE_AT_DC}, { ERRHRD, ERRgeneral, NT_STATUS_SYNCHRONIZATION_REQUIRED}, { ERRDOS, 126, NT_STATUS_DLL_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_OPEN_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_IO_PRIVILEGE_FAILED}, { ERRDOS, 182, NT_STATUS_ORDINAL_NOT_FOUND}, { ERRDOS, 127, NT_STATUS_ENTRYPOINT_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_CONTROL_C_EXIT}, { ERRDOS, 64, NT_STATUS_LOCAL_DISCONNECT}, { ERRDOS, 64, NT_STATUS_REMOTE_DISCONNECT}, { ERRDOS, 51, NT_STATUS_REMOTE_RESOURCES}, { ERRDOS, 59, NT_STATUS_LINK_FAILED}, { ERRDOS, 59, NT_STATUS_LINK_TIMEOUT}, { ERRDOS, 59, NT_STATUS_INVALID_CONNECTION}, { ERRDOS, 59, NT_STATUS_INVALID_ADDRESS}, { ERRHRD, ERRgeneral, NT_STATUS_DLL_INIT_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_MISSING_SYSTEMFILE}, { ERRHRD, ERRgeneral, NT_STATUS_UNHANDLED_EXCEPTION}, { ERRHRD, ERRgeneral, NT_STATUS_APP_INIT_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_CREATE_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_PAGEFILE}, { ERRDOS, 124, NT_STATUS_INVALID_LEVEL}, { ERRDOS, 86, NT_STATUS_WRONG_PASSWORD_CORE}, { ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, { ERRDOS, 109, NT_STATUS_PIPE_BROKEN}, { ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_CORRUPT}, { ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_IO_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_EVENT_PAIR}, { ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_VOLUME}, { ERRHRD, ERRgeneral, NT_STATUS_SERIAL_NO_DEVICE_INITED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_ALIAS}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_ALIAS}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_ALIAS}, { ERRHRD, ERRgeneral, NT_STATUS_ALIAS_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_NOT_GRANTED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SECRETS}, { ERRHRD, ERRgeneral, NT_STATUS_SECRET_TOO_LONG}, { ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_FULLSCREEN_MODE}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_CONTEXT_IDS}, { ERRDOS, ERRnoaccess, NT_STATUS_LOGON_TYPE_NOT_GRANTED}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_REGISTRY_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_FT_MISSING_MEMBER}, { ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, { ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_CHARACTER}, { ERRHRD, ERRgeneral, NT_STATUS_UNMAPPABLE_CHARACTER}, { ERRHRD, ERRgeneral, NT_STATUS_UNDEFINED_CHARACTER}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_VOLUME}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_WRONG_CYLINDER}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_UNKNOWN_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_BAD_REGISTERS}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_RECALIBRATE_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_OPERATION_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_RESET_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_SHARED_IRQ_BUSY}, { ERRHRD, ERRgeneral, NT_STATUS_FT_ORPHANING}, { ERRHRD, ERRgeneral, 0xc000016e}, { ERRHRD, ERRgeneral, 0xc000016f}, { ERRHRD, ERRgeneral, 0xc0000170}, { ERRHRD, ERRgeneral, 0xc0000171}, { ERRHRD, ERRgeneral, NT_STATUS_PARTITION_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_BLOCK_LENGTH}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_PARTITIONED}, { ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_LOCK_MEDIA}, { ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, { ERRHRD, ERRgeneral, NT_STATUS_EOM_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_NO_MEDIA}, { ERRHRD, ERRgeneral, 0xc0000179}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_MEMBER}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_MEMBER}, { ERRHRD, ERRgeneral, NT_STATUS_KEY_DELETED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_LOG_SPACE}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SIDS}, { ERRHRD, ERRgeneral, NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, { ERRHRD, ERRgeneral, NT_STATUS_KEY_HAS_CHILDREN}, { ERRHRD, ERRgeneral, NT_STATUS_CHILD_MUST_BE_VOLATILE}, { ERRDOS, 87, NT_STATUS_DEVICE_CONFIGURATION_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DRIVER_INTERNAL_ERROR}, { ERRDOS, 22, NT_STATUS_INVALID_DEVICE_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_IO_DEVICE_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_PROTOCOL_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_BACKUP_CONTROLLER}, { ERRHRD, ERRgeneral, NT_STATUS_LOG_FILE_FULL}, { ERRDOS, 19, NT_STATUS_TOO_LATE}, { ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_LSA_SECRET}, /* { This NT error code was 'sqashed' from NT_STATUS_NO_TRUST_SAM_ACCOUNT to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_SAM_ACCOUNT}, { ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_DOMAIN_FAILURE}, { ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CORRUPT}, { ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_CANT_START}, { ERRDOS, ERRnoaccess, NT_STATUS_TRUST_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_MUTANT_LIMIT_EXCEEDED}, { ERRDOS, ERRnetlogonNotStarted, NT_STATUS_NETLOGON_NOT_STARTED}, { ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_EXPIRED}, { ERRHRD, ERRgeneral, NT_STATUS_POSSIBLE_DEADLOCK}, { ERRHRD, ERRgeneral, NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, { ERRHRD, ERRgeneral, NT_STATUS_REMOTE_SESSION_LIMIT}, { ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CHANGED}, { ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, { ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, { ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, /* { This NT error code was 'sqashed' from NT_STATUS_DOMAIN_TRUST_INCONSISTENT to NT_STATUS_LOGON_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, { ERRHRD, ERRgeneral, NT_STATUS_FS_DRIVER_REQUIRED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, { ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, { ERRDOS, ERRnoresource, NT_STATUS_INSUFF_SERVER_RESOURCES}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, { ERRDOS, 68, NT_STATUS_TOO_MANY_ADDRESSES}, { ERRDOS, 52, NT_STATUS_ADDRESS_ALREADY_EXISTS}, { ERRDOS, 64, NT_STATUS_ADDRESS_CLOSED}, { ERRDOS, 64, NT_STATUS_CONNECTION_DISCONNECTED}, { ERRDOS, 64, NT_STATUS_CONNECTION_RESET}, { ERRDOS, 68, NT_STATUS_TOO_MANY_NODES}, { ERRDOS, 59, NT_STATUS_TRANSACTION_ABORTED}, { ERRDOS, 59, NT_STATUS_TRANSACTION_TIMED_OUT}, { ERRDOS, 59, NT_STATUS_TRANSACTION_NO_RELEASE}, { ERRDOS, 59, NT_STATUS_TRANSACTION_NO_MATCH}, { ERRDOS, 59, NT_STATUS_TRANSACTION_RESPONDED}, { ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_ID}, { ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_TYPE}, { ERRDOS, ERRunsup, NT_STATUS_NOT_SERVER_SESSION}, { ERRDOS, ERRunsup, NT_STATUS_NOT_CLIENT_SESSION}, { ERRHRD, ERRgeneral, NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_DEBUG_ATTACH_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_SYSTEM_PROCESS_TERMINATED}, { ERRHRD, ERRgeneral, NT_STATUS_DATA_NOT_ACCEPTED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_BROWSER_SERVERS_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_VDM_HARD_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DRIVER_CANCEL_TIMEOUT}, { ERRHRD, ERRgeneral, NT_STATUS_REPLY_MESSAGE_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_MAPPED_ALIGNMENT}, { ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW_READ}, { ERRHRD, ERRgeneral, NT_STATUS_FAIL_CHECK}, { ERRHRD, ERRgeneral, NT_STATUS_DUPLICATE_OBJECTID}, { ERRHRD, ERRgeneral, NT_STATUS_OBJECTID_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_CONVERT_TO_LARGE}, { ERRHRD, ERRgeneral, NT_STATUS_RETRY}, { ERRHRD, ERRgeneral, NT_STATUS_FOUND_OUT_OF_SCOPE}, { ERRHRD, ERRgeneral, NT_STATUS_ALLOCATE_BUCKET}, { ERRHRD, ERRgeneral, NT_STATUS_PROPSET_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_MARSHALL_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_VARIANT}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, { ERRDOS, ERRnoaccess, NT_STATUS_ACCOUNT_LOCKED_OUT}, { ERRDOS, ERRbadfid, NT_STATUS_HANDLE_NOT_CLOSABLE}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_REFUSED}, { ERRHRD, ERRgeneral, NT_STATUS_GRACEFUL_DISCONNECT}, { ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, { ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_NOT_ASSOCIATED}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_INVALID}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ACTIVE}, { ERRHRD, ERRgeneral, NT_STATUS_NETWORK_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_HOST_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_PROTOCOL_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_PORT_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_REQUEST_ABORTED}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ABORTED}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_COMPRESSION_BUFFER}, { ERRHRD, ERRgeneral, NT_STATUS_USER_MAPPED_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_AUDIT_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_TIMER_RESOLUTION_NOT_SET}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_COUNT_LIMIT}, { ERRHRD, ERRgeneral, NT_STATUS_LOGIN_TIME_RESTRICTION}, { ERRHRD, ERRgeneral, NT_STATUS_LOGIN_WKSTA_RESTRICTION}, { ERRDOS, 193, NT_STATUS_IMAGE_MP_UP_MISMATCH}, { ERRHRD, ERRgeneral, 0xc000024a}, { ERRHRD, ERRgeneral, 0xc000024b}, { ERRHRD, ERRgeneral, 0xc000024c}, { ERRHRD, ERRgeneral, 0xc000024d}, { ERRHRD, ERRgeneral, 0xc000024e}, { ERRHRD, ERRgeneral, 0xc000024f}, { ERRHRD, ERRgeneral, NT_STATUS_INSUFFICIENT_LOGON_INFO}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_DLL_ENTRYPOINT}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_SERVICE_ENTRYPOINT}, { ERRHRD, ERRgeneral, NT_STATUS_LPC_REPLY_LOST}, { ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT1}, { ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT2}, { ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_QUOTA_LIMIT}, { ERRSRV, 3, NT_STATUS_PATH_NOT_COVERED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_CALLBACK_ACTIVE}, { ERRHRD, ERRgeneral, NT_STATUS_LICENSE_QUOTA_EXCEEDED}, { ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_SHORT}, { ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_RECENT}, { ERRHRD, ERRgeneral, NT_STATUS_PWD_HISTORY_CONFLICT}, { ERRHRD, ERRgeneral, 0xc000025d}, { ERRHRD, ERRgeneral, NT_STATUS_PLUGPLAY_NO_DEVICE}, { ERRHRD, ERRgeneral, NT_STATUS_UNSUPPORTED_COMPRESSION}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_HW_PROFILE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, { ERRDOS, 182, NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, { ERRDOS, 127, NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, { ERRDOS, 288, NT_STATUS_RESOURCE_NOT_OWNED}, { ERRDOS, ErrTooManyLinks, NT_STATUS_TOO_MANY_LINKS}, { ERRHRD, ERRgeneral, NT_STATUS_QUOTA_LIST_INCONSISTENT}, { ERRHRD, ERRgeneral, NT_STATUS_FILE_IS_OFFLINE}, { ERRDOS, 21, 0xc000026e}, { ERRDOS, 161, 0xc0000281}, { ERRDOS, ERRnoaccess, 0xc000028a}, { ERRDOS, ERRnoaccess, 0xc000028b}, { ERRHRD, ERRgeneral, 0xc000028c}, { ERRDOS, ERRnoaccess, 0xc000028d}, { ERRDOS, ERRnoaccess, 0xc000028e}, { ERRDOS, ERRnoaccess, 0xc000028f}, { ERRDOS, ERRnoaccess, 0xc0000290}, { ERRDOS, ERRbadfunc, 0xc000029c}, { ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { ERRDOS, ERRinvlevel, 0x007c0001}, { 0, 0, 0 } }; /***************************************************************************** Print an error message from the status code *****************************************************************************/ static void cifs_print_status(__u32 status_code) { int idx = 0; while (nt_errs[idx].nt_errstr != NULL) { if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == (status_code & 0xFFFFFF)) { pr_notice("Status code returned 0x%08x %s\n", status_code, nt_errs[idx].nt_errstr); } idx++; } return; } static void ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode) { int i; if (ntstatus == 0) { *eclass = 0; *ecode = 0; return; } for (i = 0; ntstatus_to_dos_map[i].ntstatus; i++) { if (ntstatus == ntstatus_to_dos_map[i].ntstatus) { *eclass = ntstatus_to_dos_map[i].dos_class; *ecode = ntstatus_to_dos_map[i].dos_code; return; } } *eclass = ERRHRD; *ecode = ERRgeneral; } int map_smb_to_linux_error(char *buf, bool logErr) { struct smb_hdr *smb = (struct smb_hdr *)buf; unsigned int i; int rc = -EIO; /* if transport error smb error may not be set */ __u8 smberrclass; __u16 smberrcode; /* BB if NT Status codes - map NT BB */ /* old style smb error codes */ if (smb->Status.CifsError == 0) return 0; if (smb->Flags2 & SMBFLG2_ERR_STATUS) { /* translate the newer STATUS codes to old style SMB errors * and then to POSIX errors */ __u32 err = le32_to_cpu(smb->Status.CifsError); if (logErr && (err != (NT_STATUS_MORE_PROCESSING_REQUIRED))) cifs_print_status(err); else if (cifsFYI & CIFS_RC) cifs_print_status(err); ntstatus_to_dos(err, &smberrclass, &smberrcode); } else { smberrclass = smb->Status.DosError.ErrorClass; smberrcode = le16_to_cpu(smb->Status.DosError.Error); } /* old style errors */ /* DOS class smb error codes - map DOS */ if (smberrclass == ERRDOS) { /* 1 byte field no need to byte reverse */ for (i = 0; i < sizeof(mapping_table_ERRDOS) / sizeof(struct smb_to_posix_error); i++) { if (mapping_table_ERRDOS[i].smb_err == 0) break; else if (mapping_table_ERRDOS[i].smb_err == smberrcode) { rc = mapping_table_ERRDOS[i].posix_code; break; } /* else try next error mapping one to see if match */ } } else if (smberrclass == ERRSRV) { /* server class of error codes */ for (i = 0; i < sizeof(mapping_table_ERRSRV) / sizeof(struct smb_to_posix_error); i++) { if (mapping_table_ERRSRV[i].smb_err == 0) break; else if (mapping_table_ERRSRV[i].smb_err == smberrcode) { rc = mapping_table_ERRSRV[i].posix_code; break; } /* else try next error mapping to see if match */ } } /* else ERRHRD class errors or junk - return EIO */ cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", le32_to_cpu(smb->Status.CifsError), rc); /* generic corrective action e.g. reconnect SMB session on * ERRbaduid could be added */ return rc; } int map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) { int rc; struct smb_hdr *smb = (struct smb_hdr *)mid->resp_buf; rc = map_smb_to_linux_error((char *)smb, logErr); if (rc == -EACCES && !(smb->Flags2 & SMBFLG2_ERR_STATUS)) { /* possible ERRBaduid */ __u8 class = smb->Status.DosError.ErrorClass; __u16 code = le16_to_cpu(smb->Status.DosError.Error); /* switch can be used to handle different errors */ if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); cifs_signal_cifsd_for_reconnect(mid->server, false); } } return rc; } /* * calculate the size of the SMB message based on the fixed header * portion, the number of word parameters and the data portion of the message */ unsigned int smbCalcSize(void *buf) { struct smb_hdr *ptr = buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 2 /* size of the bcc field */ + get_bcc(ptr)); } /* The following are taken from fs/ntfs/util.c */ #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) /* * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) * into Unix UTC (based 1970-01-01, in seconds). */ struct timespec64 cifs_NTtimeToUnix(__le64 ntutc) { struct timespec64 ts; /* BB what about the timezone? BB */ /* Subtract the NTFS time offset, then convert to 1s intervals. */ s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; u64 abs_t; /* * Unfortunately can not use normal 64 bit division on 32 bit arch, but * the alternative, do_div, does not work with negative numbers so have * to special case them */ if (t < 0) { abs_t = -t; ts.tv_nsec = (time64_t)(do_div(abs_t, 10000000) * 100); ts.tv_nsec = -ts.tv_nsec; ts.tv_sec = -abs_t; } else { abs_t = t; ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100; ts.tv_sec = abs_t; } return ts; } /* Convert the Unix UTC into NT UTC. */ u64 cifs_UnixTimeToNT(struct timespec64 t) { /* Convert to 100ns intervals and then add the NTFS time offset. */ return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET; } static const int total_days_of_prev_months[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 }; struct timespec64 cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) { struct timespec64 ts; time64_t sec, days; int min, day, month, year; u16 date = le16_to_cpu(le_date); u16 time = le16_to_cpu(le_time); SMB_TIME *st = (SMB_TIME *)&time; SMB_DATE *sd = (SMB_DATE *)&date; cifs_dbg(FYI, "date %d time %d\n", date, time); sec = 2 * st->TwoSeconds; min = st->Minutes; if ((sec > 59) || (min > 59)) cifs_dbg(VFS, "Invalid time min %d sec %lld\n", min, sec); sec += (min * 60); sec += 60 * 60 * st->Hours; if (st->Hours > 24) cifs_dbg(VFS, "Invalid hours %d\n", st->Hours); day = sd->Day; month = sd->Month; if (day < 1 || day > 31 || month < 1 || month > 12) { cifs_dbg(VFS, "Invalid date, month %d day: %d\n", month, day); day = clamp(day, 1, 31); month = clamp(month, 1, 12); } month -= 1; days = day + total_days_of_prev_months[month]; days += 3652; /* account for difference in days between 1980 and 1970 */ year = sd->Year; days += year * 365; days += (year/4); /* leap year */ /* generalized leap year calculation is more complex, ie no leap year for years/100 except for years/400, but since the maximum number for DOS year is 2**7, the last year is 1980+127, which means we need only consider 2 special case years, ie the years 2000 and 2100, and only adjust for the lack of leap year for the year 2100, as 2000 was a leap year (divisible by 400) */ if (year >= 120) /* the year 2100 */ days = days - 1; /* do not count leap year for the year 2100 */ /* adjust for leap year where we are still before leap day */ if (year != 120) days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); sec += 24 * 60 * 60 * days; ts.tv_sec = sec + offset; /* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */ ts.tv_nsec = 0; return ts; } |
| 595 595 595 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | // SPDX-License-Identifier: LGPL-2.0+ /* * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. * This file is part of the GNU C Library. * Contributed by Paul Eggert (eggert@twinsun.com). * * The GNU C Library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * The GNU C Library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with the GNU C Library; see the file COPYING.LIB. If not, * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ /* * Converts the calendar time to broken-down time representation * * 2009-7-14: * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> * 2021-06-02: * Reimplemented by Cassio Neri <cassio.neri@gmail.com> */ #include <linux/time.h> #include <linux/module.h> #include <linux/kernel.h> #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset: offset seconds adding to totalsecs. * @result: pointer to struct tm variable to receive broken-down time */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day; u64 u64tmp, udays, century, year; bool is_Jan_or_Feb, is_leap_year; long days, rem; int remainder; days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; --days; } while (rem >= SECS_PER_DAY) { rem -= SECS_PER_DAY; ++days; } result->tm_hour = rem / SECS_PER_HOUR; rem %= SECS_PER_HOUR; result->tm_min = rem / 60; result->tm_sec = rem % 60; /* January 1, 1970 was a Thursday. */ result->tm_wday = (4 + days) % 7; if (result->tm_wday < 0) result->tm_wday += 7; /* * The following algorithm is, basically, Proposition 6.3 of Neri * and Schneider [1]. In a few words: it works on the computational * (fictitious) calendar where the year starts in March, month = 2 * (*), and finishes in February, month = 13. This calendar is * mathematically convenient because the day of the year does not * depend on whether the year is leap or not. For instance: * * March 1st 0-th day of the year; * ... * April 1st 31-st day of the year; * ... * January 1st 306-th day of the year; (Important!) * ... * February 28th 364-th day of the year; * February 29th 365-th day of the year (if it exists). * * After having worked out the date in the computational calendar * (using just arithmetics) it's easy to convert it to the * corresponding date in the Gregorian calendar. * * [1] "Euclidean Affine Functions and Applications to Calendar * Algorithms". https://arxiv.org/abs/2102.06959 * * (*) The numbering of months follows tm more closely and thus, * is slightly different from [1]. */ udays = ((u64) days) + 2305843009213814918ULL; u64tmp = 4 * udays + 3; century = div64_u64_rem(u64tmp, 146097, &u64tmp); day_of_century = (u32) (u64tmp / 4); u32tmp = 4 * day_of_century + 3; u64tmp = 2939745ULL * u32tmp; year_of_century = upper_32_bits(u64tmp); day_of_year = lower_32_bits(u64tmp) / 2939745 / 4; year = 100 * century + year_of_century; is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4); u32tmp = 2141 * day_of_year + 132377; month = u32tmp >> 16; day = ((u16) u32tmp) / 2141; /* * Recall that January 1st is the 306-th day of the year in the * computational (not Gregorian) calendar. */ is_Jan_or_Feb = day_of_year >= 306; /* Convert to the Gregorian calendar and adjust to Unix time. */ year = year + is_Jan_or_Feb - 6313183731940000ULL; month = is_Jan_or_Feb ? month - 12 : month; day = day + 1; day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year; /* Convert to tm's format. */ result->tm_year = (long) (year - 1900); result->tm_mon = (int) month; result->tm_mday = (int) day; result->tm_yday = (int) day_of_year; } EXPORT_SYMBOL(time64_to_tm); |
| 1 1 1 1 1 2 1 1 5 1 4 1 3 1 2 2 170 13 2 165 7 2 3 3 25 9 24 17 7 2 3 19 24 21 175 175 172 3 177 178 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_error.h" #include "xfs_alloc.h" #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" /* * Write new AG headers to disk. Non-transactional, but need to be * written and completed prior to the growfs transaction being logged. * To do this, we use a delayed write buffer list and wait for * submission and IO completion of the list as a whole. This allows the * IO subsystem to merge all the AG headers in a single AG into a single * IO and hide most of the latency of the IO from us. * * This also means that if we get an error whilst building the buffer * list to write, we can cancel the entire list without having written * anything. */ static int xfs_resizefs_init_new_ags( struct xfs_trans *tp, struct aghdr_init_data *id, xfs_agnumber_t oagcount, xfs_agnumber_t nagcount, xfs_rfsblock_t delta, struct xfs_perag *last_pag, bool *lastag_extended) { struct xfs_mount *mp = tp->t_mountp; xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta; int error; *lastag_extended = false; INIT_LIST_HEAD(&id->buffer_list); for (id->agno = nagcount - 1; id->agno >= oagcount; id->agno--, delta -= id->agsize) { if (id->agno == nagcount - 1) id->agsize = nb - (id->agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); else id->agsize = mp->m_sb.sb_agblocks; error = xfs_ag_init_headers(mp, id); if (error) { xfs_buf_delwri_cancel(&id->buffer_list); return error; } } error = xfs_buf_delwri_submit(&id->buffer_list); if (error) return error; if (delta) { *lastag_extended = true; error = xfs_ag_extend_space(last_pag, tp, delta); } return error; } /* * growfs operations */ static int xfs_growfs_data_private( struct xfs_mount *mp, /* mount point for filesystem */ struct xfs_growfs_data *in) /* growfs data input struct */ { struct xfs_buf *bp; int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_div, nb_mod; int64_t delta; bool lastag_extended = false; xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; struct xfs_perag *last_pag; nb = in->newblocks; error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); if (error) return error; if (nb > mp->m_sb.sb_dblocks) { error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); if (error) return error; xfs_buf_relse(bp); } nb_div = nb; nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) nb_div++; else if (nb_mod) nb = nb_div * mp->m_sb.sb_agblocks; if (nb_div > XFS_MAX_AGNUMBER + 1) { nb_div = XFS_MAX_AGNUMBER + 1; nb = nb_div * mp->m_sb.sb_agblocks; } nagcount = nb_div; delta = nb - mp->m_sb.sb_dblocks; /* * Reject filesystems with a single AG because they are not * supported, and reject a shrink operation that would cause a * filesystem to become unsupported. */ if (delta < 0 && nagcount < 2) return -EINVAL; /* No work to do */ if (delta == 0) return 0; oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); if (error) return error; } else if (nagcount < oagcount) { /* TODO: shrinking the entire AGs hasn't yet completed */ return -EINVAL; } if (delta > 0) error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); else error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, 0, &tp); if (error) goto out_free_unused_perag; last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, last_pag, &lastag_extended); } else { xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); error = xfs_ag_shrink_space(last_pag, &tp, -delta); } xfs_perag_put(last_pag); if (error) goto out_trans_cancel; /* * Update changed superblock fields transactionally. These are not * seen by the rest of the world until the transaction commit applies * them atomically to the superblock. */ if (nagcount > oagcount) xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); if (delta) xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta); if (id.nfree) xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); /* * Sync sb counters now to reflect the updated values. This is * particularly important for shrink because the write verifier * will fail if sb_fdblocks is ever larger than sb_dblocks. */ if (xfs_has_lazysbcount(mp)) xfs_log_sb(tp); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) return error; /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); if (delta > 0) { /* * If we expanded the last AG, free the per-AG reservation * so we can reinitialize it with the new size. */ if (lastag_extended) { struct xfs_perag *pag; pag = xfs_perag_get(mp, id.agno); xfs_ag_resv_free(pag); xfs_perag_put(pag); } /* * Reserve AG metadata blocks. ENOSPC here does not mean there * was a growfs failure, just that there still isn't space for * new user data after the grow has been run. */ error = xfs_fs_reserve_ag_blocks(mp); if (error == -ENOSPC) error = 0; } return error; out_trans_cancel: xfs_trans_cancel(tp); out_free_unused_perag: if (nagcount > oagcount) xfs_free_unused_perag_range(mp, oagcount, nagcount); return error; } static int xfs_growfs_log_private( struct xfs_mount *mp, /* mount point for filesystem */ struct xfs_growfs_log *in) /* growfs log input struct */ { xfs_extlen_t nb; nb = in->newblocks; if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) return -EINVAL; if (nb == mp->m_sb.sb_logblocks && in->isint == (mp->m_sb.sb_logstart != 0)) return -EINVAL; /* * Moving the log is hard, need new interfaces to sync * the log first, hold off all activity while moving it. * Can have shorter or longer log in the same space, * or transform internal to external log or vice versa. */ return -ENOSYS; } static int xfs_growfs_imaxpct( struct xfs_mount *mp, __u32 imaxpct) { struct xfs_trans *tp; int dpct; int error; if (imaxpct > 100) return -EINVAL; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); if (error) return error; dpct = imaxpct - mp->m_sb.sb_imax_pct; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } /* * protected versions of growfs function acquire and release locks on the mount * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, * XFS_IOC_FSGROWFSRT */ int xfs_growfs_data( struct xfs_mount *mp, struct xfs_growfs_data *in) { int error = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; /* update imaxpct separately to the physical grow of the filesystem */ if (in->imaxpct != mp->m_sb.sb_imax_pct) { error = xfs_growfs_imaxpct(mp, in->imaxpct); if (error) goto out_error; } if (in->newblocks != mp->m_sb.sb_dblocks) { error = xfs_growfs_data_private(mp, in); if (error) goto out_error; } /* Post growfs calculations needed to reflect new state in operations */ if (mp->m_sb.sb_imax_pct) { uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; do_div(icount, 100); M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount); } else M_IGEO(mp)->maxicount = 0; /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); out_error: /* * Increment the generation unconditionally, the error could be from * updating the secondary superblocks, in which case the new size * is live already. */ mp->m_generation++; mutex_unlock(&mp->m_growlock); return error; } int xfs_growfs_log( xfs_mount_t *mp, struct xfs_growfs_log *in) { int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; error = xfs_growfs_log_private(mp, in); mutex_unlock(&mp->m_growlock); return error; } /* * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number * reserved are returned in outval. */ int xfs_reserve_blocks( struct xfs_mount *mp, uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; int64_t free; int error = 0; /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can * do the modification as necessary. * * We do this under the m_sb_lock so that if we are near ENOSPC, we will * hold out any changes while we work out what to do. This means that * the amount of free space can change while we do this, so we need to * retry if we end up trying to reserve more space than is available. */ spin_lock(&mp->m_sb_lock); /* * If our previous reservation was larger than the current value, * then move any unused blocks back to the free pool. Modify the resblks * counters directly since we shouldn't have any problems unreserving * space. */ if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); xfs_add_fdblocks(mp, fdblks_delta); spin_lock(&mp->m_sb_lock); } goto out; } /* * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from * fdblocks to stash in the reserve pool. This is a classic TOCTOU * race since fdblocks updates are not always coordinated via * m_sb_lock. Set the reserve size even if there's not enough free * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. */ free = percpu_counter_sum(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); delta = request - mp->m_resblks; mp->m_resblks = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block * count or we'll get an ENOSPC. Don't set the reserved flag * here - we don't want to reserve the extra reserve blocks * from the reserve. * * The desired reserve size can change after we drop the lock. * Use mod_fdblocks to put the space into the reserve or into * fdblocks as appropriate. */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_dec_fdblocks(mp, fdblks_delta, 0); if (!error) xfs_add_fdblocks(mp, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: spin_unlock(&mp->m_sb_lock); return error; } int xfs_fs_goingdown( xfs_mount_t *mp, uint32_t inflags) { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { if (!bdev_freeze(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); bdev_thaw(mp->m_super->s_bdev); } break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); break; case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); break; default: return -EINVAL; } return 0; } /* * Force a shutdown of the filesystem instantly while keeping the filesystem * consistent. We don't do an unmount here; just shutdown the shop, make sure * that absolutely nothing persistent happens to this filesystem after this * point. * * The shutdown state change is atomic, resulting in the first and only the * first shutdown call processing the shutdown. This means we only shutdown the * log once as it requires, and we don't spam the logs when multiple concurrent * shutdowns race to set the shutdown flags. */ void xfs_do_force_shutdown( struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum) { int tag; const char *why; if (xfs_set_shutdown(mp)) { xlog_shutdown_wait(mp->m_log); return; } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; if (flags & SHUTDOWN_FORCE_UMOUNT) xfs_alert(mp, "User initiated shutdown received."); if (xlog_force_shutdown(mp->m_log, flags)) { tag = XFS_PTAG_SHUTDOWN_LOGERROR; why = "Log I/O Error"; } else if (flags & SHUTDOWN_CORRUPT_INCORE) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of in-memory data"; } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of on-disk metadata"; } else if (flags & SHUTDOWN_DEVICE_REMOVED) { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Block device removal"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; } trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum); xfs_alert_tag(mp, tag, "%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.", why, flags, __return_address, fname, lnnum); xfs_alert(mp, "Please unmount the filesystem and rectify the problem(s)"); if (xfs_error_level >= XFS_ERRLEVEL_HIGH) xfs_stack_trace(); } /* * Reserve free space for per-AG metadata. */ int xfs_fs_reserve_ag_blocks( struct xfs_mount *mp) { xfs_agnumber_t agno; struct xfs_perag *pag; int error = 0; int err2; mp->m_finobt_nores = false; for_each_perag(mp, agno, pag) { err2 = xfs_ag_resv_init(pag, NULL); if (err2 && !error) error = err2; } if (error && error != -ENOSPC) { xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", error); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } return error; } /* * Free space reserved for per-AG metadata. */ void xfs_fs_unreserve_ag_blocks( struct xfs_mount *mp) { xfs_agnumber_t agno; struct xfs_perag *pag; for_each_perag(mp, agno, pag) xfs_ag_resv_free(pag); } |
| 7 68 1 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_H #define BLK_MQ_H #include <linux/blkdev.h> #include <linux/sbitmap.h> #include <linux/lockdep.h> #include <linux/scatterlist.h> #include <linux/prefetch.h> #include <linux/srcu.h> #include <linux/rw_hint.h> struct blk_mq_tags; struct blk_flush_queue; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 enum rq_end_io_ret { RQ_END_IO_NONE, RQ_END_IO_FREE, }; typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); /* * request flags */ typedef __u32 __bitwise req_flags_t; /* Keep rqf_name[] in sync with the definitions below */ enum { /* drive already may have started this one */ __RQF_STARTED, /* request for flush sequence */ __RQF_FLUSH_SEQ, /* merge of different types, fail separately */ __RQF_MIXED_MERGE, /* don't call prep for this one */ __RQF_DONTPREP, /* use hctx->sched_tags */ __RQF_SCHED_TAGS, /* use an I/O scheduler for this request */ __RQF_USE_SCHED, /* vaguely specified driver internal error. Ignored by block layer */ __RQF_FAILED, /* don't warn about errors */ __RQF_QUIET, /* account into disk and partition IO statistics */ __RQF_IO_STAT, /* runtime pm request */ __RQF_PM, /* on IO scheduler merge hash */ __RQF_HASHED, /* track IO completion time */ __RQF_STATS, /* Look at ->special_vec for the actual data payload instead of the bio chain. */ __RQF_SPECIAL_PAYLOAD, /* request completion needs to be signaled to zone write plugging. */ __RQF_ZONE_WRITE_PLUGGING, /* ->timeout has been called, don't expire again */ __RQF_TIMED_OUT, __RQF_RESV, __RQF_BITS }; #define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED)) #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ)) #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE)) #define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP)) #define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS)) #define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED)) #define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED)) #define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET)) #define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT)) #define RQF_PM ((__force req_flags_t)(1 << __RQF_PM)) #define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED)) #define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS)) #define RQF_SPECIAL_PAYLOAD \ ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD)) #define RQF_ZONE_WRITE_PLUGGING \ ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING)) #define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT)) #define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) enum mq_rq_state { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1, MQ_RQ_COMPLETE = 2, }; /* * Try to put the fields that are referenced together in the same cacheline. * * If you modify this structure, make sure to update blk_rq_init() and * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct request_queue *q; struct blk_mq_ctx *mq_ctx; struct blk_mq_hw_ctx *mq_hctx; blk_opf_t cmd_flags; /* op and common flags */ req_flags_t rq_flags; int tag; int internal_tag; unsigned int timeout; /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ sector_t __sector; /* sector cursor */ struct bio *bio; struct bio *biotail; union { struct list_head queuelist; struct request *rq_next; }; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; #endif /* Time that this request was allocated for this IO. */ u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif /* * rq sectors used for blk stats. It has the same value * with blk_rq_sectors(rq), except that it never be zeroed * by completion. */ unsigned short stats_sectors; /* * Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; unsigned short nr_integrity_segments; #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; #endif enum rw_hint write_hint; unsigned short ioprio; enum mq_rq_state state; atomic_t ref; unsigned long deadline; /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used * to queue the request for softirq completion, which is long * after the request has been unhashed (and even removed from * the dispatch list). */ union { struct hlist_node hash; /* merge hash */ struct llist_node ipi_list; }; /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. special_vec must * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be * insert into an IO scheduler. */ union { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; }; /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. */ struct { struct io_cq *icq; void *priv[2]; } elv; struct { unsigned int seq; rq_end_io_fn *saved_end_io; } flush; u64 fifo_time; /* * completion callback. */ rq_end_io_fn *end_io; void *end_io_data; }; static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; } static inline bool blk_rq_is_passthrough(struct request *rq) { return blk_op_is_passthrough(rq->cmd_flags); } static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; } #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) #define rq_list_add(listptr, rq) do { \ (rq)->rq_next = *(listptr); \ *(listptr) = rq; \ } while (0) #define rq_list_add_tail(lastpptr, rq) do { \ (rq)->rq_next = NULL; \ **(lastpptr) = rq; \ *(lastpptr) = &rq->rq_next; \ } while (0) #define rq_list_pop(listptr) \ ({ \ struct request *__req = NULL; \ if ((listptr) && *(listptr)) { \ __req = *(listptr); \ *(listptr) = __req->rq_next; \ } \ __req; \ }) #define rq_list_peek(listptr) \ ({ \ struct request *__req = NULL; \ if ((listptr) && *(listptr)) \ __req = *(listptr); \ __req; \ }) #define rq_list_for_each(listptr, pos) \ for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) #define rq_list_for_each_safe(listptr, pos, nxt) \ for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) /** * rq_list_move() - move a struct request from one list to another * @src: The source list @rq is currently in * @dst: The destination list that @rq will be appended to * @rq: The request to move * @prev: The request preceding @rq in @src (NULL if @rq is the head) */ static inline void rq_list_move(struct request **src, struct request **dst, struct request *rq, struct request *prev) { if (prev) prev->rq_next = rq->rq_next; else *src = rq->rq_next; rq_list_add(dst, rq); } /** * enum blk_eh_timer_return - How the timeout handler should proceed * @BLK_EH_DONE: The block driver completed the command or will complete it at * a later time. * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the * request to complete. */ enum blk_eh_timer_return { BLK_EH_DONE, BLK_EH_RESET_TIMER, }; /* Keep alloc_policy_name[] in sync with the definitions below */ enum { BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */ BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */ BLK_TAG_ALLOC_MAX }; /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device */ struct blk_mq_hw_ctx { struct { /** @lock: Protects the dispatch list. */ spinlock_t lock; /** * @dispatch: Used for requests that are ready to be * dispatched to the hardware but for some reason (e.g. lack of * resources) could not be sent to the hardware. As soon as the * driver can send new requests, requests at this list will * be sent first for a fairer dispatch. */ struct list_head dispatch; /** * @state: BLK_MQ_S_* flags. Defines the state of the hw * queue (active, scheduled to restart, stopped). */ unsigned long state; } ____cacheline_aligned_in_smp; /** * @run_work: Used for scheduling a hardware queue run at a later time. */ struct delayed_work run_work; /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; /** * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU * selection from @cpumask. */ int next_cpu; /** * @next_cpu_batch: Counter of how many works left in the batch before * changing to the next CPU. */ int next_cpu_batch; /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ unsigned long flags; /** * @sched_data: Pointer owned by the IO scheduler attached to a request * queue. It's up to the IO scheduler how to use this pointer. */ void *sched_data; /** * @queue: Pointer to the request queue that owns this hardware context. */ struct request_queue *queue; /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; /** * @driver_data: Pointer to data owned by the block driver that created * this hctx */ void *driver_data; /** * @ctx_map: Bitmap for each software queue. If bit is on, there is a * pending request in that software queue. */ struct sbitmap ctx_map; /** * @dispatch_from: Software queue to be used when no scheduler was * selected. */ struct blk_mq_ctx *dispatch_from; /** * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to * decide if the hw_queue is busy using Exponential Weighted Moving * Average algorithm. */ unsigned int dispatch_busy; /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; /** * @dispatch_wait: Waitqueue to put requests when there is no tag * available at the moment, to wait for another try in the future. */ wait_queue_entry_t dispatch_wait; /** * @wait_index: Index of next available dispatch_wait queue to insert * requests. */ atomic_t wait_index; /** * @tags: Tags owned by the block driver. A tag at this set is only * assigned when a request is dispatched from a hardware queue. */ struct blk_mq_tags *tags; /** * @sched_tags: Tags owned by I/O scheduler. If there is an I/O * scheduler associated with a request queue, a tag is assigned when * that request is allocated. Else, this member is not used. */ struct blk_mq_tags *sched_tags; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; /** * @nr_active: Number of active requests. Only used when a tag set is * shared across request queues. */ atomic_t nr_active; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ struct kobject kobj; #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named * as cpu<cpu_number>. */ struct dentry *debugfs_dir; /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif /** * @hctx_list: if this hctx is not in use, this is an entry in * q->unused_hctx_list. */ struct list_head hctx_list; }; /** * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). * @nr_queues: Number of hardware queues to map CPU IDs onto. * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe * driver to map each hardware queue type (enum hctx_type) onto a distinct * set of hardware queues. */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; unsigned int queue_offset; }; /** * enum hctx_type - Type of hardware queue * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. * @HCTX_TYPE_READ: Just for READ I/O. * @HCTX_TYPE_POLL: Polled I/O of any kind. * @HCTX_MAX_TYPES: Number of types of hctx. */ enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; /** * struct blk_mq_tag_set - tag set that can be shared between request queues * @ops: Pointers to functions that implement block driver behavior. * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the * same size, and it's perfectly legal to share maps between * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag * allocations. * @cmd_size: Number of additional bytes to allocate per request. The block * driver owns these additional bytes. * @numa_node: NUMA node the storage adapter has been connected to. * @timeout: Request processing timeout in jiffies. * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @shared_tags: * Shared set of tags. Has @nr_hw_queues elements. If set, * shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). */ struct blk_mq_tag_set { const struct blk_mq_ops *ops; struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsigned int nr_maps; unsigned int nr_hw_queues; unsigned int queue_depth; unsigned int reserved_tags; unsigned int cmd_size; int numa_node; unsigned int timeout; unsigned int flags; void *driver_data; struct blk_mq_tags **tags; struct blk_mq_tags *shared_tags; struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; }; /** * struct blk_mq_queue_data - Data about a request inserted in a queue * * @rq: Request pointer. * @last: If it is the last request in the queue. */ struct blk_mq_queue_data { struct request *rq; bool last; }; typedef bool (busy_tag_iter_fn)(struct request *, void *); /** * struct blk_mq_ops - Callback functions that implements block driver * behaviour. */ struct blk_mq_ops { /** * @queue_rq: Queue a new request from block IO. */ blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); /** * @commit_rqs: If a driver uses bd->last to judge when to submit * requests to hardware, it must define this function. In case of errors * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ void (*commit_rqs)(struct blk_mq_hw_ctx *); /** * @queue_rqs: Queue a list of new requests. Driver is guaranteed * that each request belongs to the same queue. If the driver doesn't * empty the @rqlist completely, then the rest will be queued * individually by the block layer upon return. */ void (*queue_rqs)(struct request **rqlist); /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ int (*get_budget)(struct request_queue *); /** * @put_budget: Release the reserved budget. */ void (*put_budget)(struct request_queue *, int); /** * @set_rq_budget_token: store rq's budget token */ void (*set_rq_budget_token)(struct request *, int); /** * @get_rq_budget_token: retrieve rq's budget token */ int (*get_rq_budget_token)(struct request *); /** * @timeout: Called on request timeout. */ enum blk_eh_timer_return (*timeout)(struct request *); /** * @poll: Called to poll for completion of a specific tag. */ int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); /** * @complete: Mark the request as complete. */ void (*complete)(struct request *); /** * @init_hctx: Called when the block layer side of a hardware queue has * been set up, allowing the driver to allocate/init matching * structures. */ int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int); /** * @exit_hctx: Ditto for exit/teardown. */ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); /** * @init_request: Called for every command allocated by the block layer * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. */ int (*init_request)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); /** * @exit_request: Ditto for exit/teardown. */ void (*exit_request)(struct blk_mq_tag_set *set, struct request *, unsigned int); /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. */ void (*cleanup_rq)(struct request *); /** * @busy: If set, returns whether or not this queue currently is busy. */ bool (*busy)(struct request_queue *); /** * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); #endif }; /* Keep hctx_flag_name[] in sync with the definitions below */ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, /* Do not allow an I/O scheduler to be configured. */ BLK_MQ_F_NO_SCHED = 1 << 5, /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. */ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 7, BLK_MQ_F_ALLOC_POLICY_BITS = 1, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) #define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) #define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) enum { /* Keep hctx_state_name[] in sync with the definitions below */ BLK_MQ_S_STOPPED, BLK_MQ_S_TAG_ACTIVE, BLK_MQ_S_SCHED_RESTART, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE, BLK_MQ_S_MAX }; struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_request(struct request *rq); int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags); bool blk_mq_queue_inflight(struct request_queue *q); enum { /* return when out of requests */ BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), }; struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx); /* * Tag address space map. */ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; unsigned int active_queues; struct sbitmap_queue bitmap_tags; struct sbitmap_queue breserved_tags; struct request **rqs; struct request **static_rqs; struct list_head page_list; /* * used to clear request reference in rqs[] before freeing one * request pool */ spinlock_t lock; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { prefetch(tags->rqs[tag]); return tags->rqs[tag]; } return NULL; } enum { BLK_MQ_UNIQUE_TAG_BITS = 16, BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, }; u32 blk_mq_unique_tag(struct request *rq); static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) { return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; } static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) { return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } /** * blk_mq_rq_state() - read the current MQ_RQ_* state of a request * @rq: target request. */ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) { return READ_ONCE(rq->state); } static inline int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } static inline int blk_mq_request_completed(struct request *rq) { return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; } /* * * Set the state to complete when completing a request from inside ->queue_rq. * This is used by drivers that want to ensure special complete actions that * need access to the request are called on failure, e.g. by nvme for * multipathing. */ static inline void blk_mq_set_request_complete(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } /* * Complete the request directly instead of deferring it to softirq or * completing it another CPU. Useful in preemptible instead of an interrupt. */ static inline void blk_mq_complete_request_direct(struct request *rq, void (*complete)(struct request *rq)) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); complete(rq); } void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_end_request_batch(struct io_comp_batch *ib); /* * Only need start/end time stamping if we have iostat or * blk stats enabled, or using an IO scheduler. */ static inline bool blk_mq_need_time_stamp(struct request *rq) { /* * passthrough io doesn't use iostat accounting, cgroup stats * and io scheduler functionalities. */ if (blk_rq_is_passthrough(rq)) return false; return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } static inline bool blk_mq_is_reserved_rq(struct request *rq) { return rq->rq_flags & RQF_RESV; } /* * Batched completions only work when there is no I/O error and no special * ->end_io handler. */ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { /* * blk_mq_end_request_batch() can't end request allocated from * sched tags */ if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || (req->end_io && !blk_rq_is_passthrough(req))) return false; if (!iob->complete) iob->complete = complete; else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); rq_list_add(&iob->req_list, req); return true; } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); bool __blk_should_fake_timeout(struct request_queue *q); static inline bool blk_should_fake_timeout(struct request_queue *q) { if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) return __blk_should_fake_timeout(q); return false; } /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted * * Return: request * * Driver command data is immediately after the request. So subtract request * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } /** * blk_mq_rq_to_pdu - cast a request to a PDU * @rq: the request to be casted * * Return: pointer to the PDU * * Driver command data is immediately after the request. So add request to get * the PDU. */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; } #define queue_for_each_hw_ctx(q, hctx, i) \ xa_for_each(&(q)->hctx_table, (i), (hctx)) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) rq->q->mq_ops->cleanup_rq(rq); } static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs) { rq->nr_phys_segments = nr_segs; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); } void blk_rq_init(struct request_queue *q, struct request *rq); int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); void blk_rq_unprep_clone(struct request *rq); blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; unsigned long offset; unsigned short page_order; unsigned short nr_entries; bool null_mapped; bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); int blk_rq_map_user_io(struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t, bool, int, bool, int); int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); bool blk_rq_is_poll(struct request *rq); struct req_iterator { struct bvec_iter iter; struct bio *bio; }; #define __rq_for_each_bio(_bio, rq) \ if ((rq->bio)) \ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) #define rq_for_each_segment(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) #define rq_for_each_bvec(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_bvec(bvl, _iter.bio, _iter.iter) #define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ bio_iter_last(bvec, _iter.iter)) /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats */ static inline sector_t blk_rq_pos(const struct request *rq) { return rq->__sector; } static inline unsigned int blk_rq_bytes(const struct request *rq) { return rq->__data_len; } static inline int blk_rq_cur_bytes(const struct request *rq) { if (!rq->bio) return 0; if (!bio_has_data(rq->bio)) /* dataless requests such as discard */ return rq->bio->bi_iter.bi_size; return bio_iovec(rq->bio).bv_len; } static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_cur_sectors(const struct request *rq) { return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_stats_sectors(const struct request *rq) { return rq->stats_sectors; } /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to * calculate the data transfer size. */ static inline unsigned int blk_rq_payload_bytes(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec.bv_len; return blk_rq_bytes(rq); } /* * Return the first full biovec in the request. The caller needs to check that * there are any bvecs before calling this helper. */ static inline struct bio_vec req_bvec(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec; return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); } static inline unsigned int blk_rq_count_bios(struct request *rq) { unsigned int nr_bios = 0; struct bio *bio; __rq_for_each_bio(bio, rq) nr_bios++; return nr_bios; } void blk_steal_bios(struct bio_list *list, struct request *rq); /* * Request completion related functions. * * blk_update_request() completes given number of bytes and updates * the request without completing it. */ bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); void blk_abort_request(struct request *); /* * Number of physical segments as sent to the device. * * Normally this is the number of discontiguous data segments sent by the * submitter. But for data-less command like discard we might have no * actual data segments submitted, but the driver might have to add it's * own special payload. In that case we still return 1 here so that this * special payload will be mapped. */ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return 1; return rq->nr_phys_segments; } /* * Number of discard segments (or ranges) the driver needs to fill in. * Each discard bio merged into a request is counted as one segment. */ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) { return max_t(unsigned short, rq->nr_phys_segments, 1); } int __blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg); static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; return __blk_rq_map_sg(q, rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); #endif /* BLK_MQ_H */ |
| 278 276 27 278 57 25 25 25 4 25 19 24 282 282 282 282 278 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | // SPDX-License-Identifier: GPL-2.0 #include "fs.h" #include "messages.h" #include "discard.h" #include "super.h" #ifdef CONFIG_PRINTK #define STATE_STRING_PREFACE " state " #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', [BTRFS_FS_STATE_DEV_REPLACING] = 'R', [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S', [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', }; static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) { unsigned int bit; bool states_printed = false; unsigned long fs_state = READ_ONCE(info->fs_state); char *curr = buf; memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); curr += sizeof(STATE_STRING_PREFACE) - 1; if (BTRFS_FS_ERROR(info)) { *curr++ = 'E'; states_printed = true; } for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { *curr++ = fs_state_chars[bit]; states_printed = true; } } /* If no states were printed, reset the buffer */ if (!states_printed) curr = buf; *curr++ = 0; } #endif /* * Generally the error codes correspond to their respective errors, but there * are a few special cases. * * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for * instance will return EUCLEAN if any of the blocks are corrupted in * a way that is problematic. We want to reserve EUCLEAN for these * sort of corruptions. * * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we * need to use EROFS for this case. We will have no idea of the * original failure, that will have been reported at the time we tripped * over the error. Each subsequent error that doesn't have any context * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. */ const char * __attribute_const__ btrfs_decode_error(int error) { char *errstr = "unknown"; switch (error) { case -ENOENT: /* -2 */ errstr = "No such entry"; break; case -EIO: /* -5 */ errstr = "IO failure"; break; case -ENOMEM: /* -12*/ errstr = "Out of memory"; break; case -EEXIST: /* -17 */ errstr = "Object already exists"; break; case -ENOSPC: /* -28 */ errstr = "No space left"; break; case -EROFS: /* -30 */ errstr = "Readonly filesystem"; break; case -EOPNOTSUPP: /* -95 */ errstr = "Operation not supported"; break; case -EUCLEAN: /* -117 */ errstr = "Filesystem corrupted"; break; case -EDQUOT: /* -122 */ errstr = "Quota exceeded"; break; } return errstr; } /* * Decodes expected errors from the caller and invokes the appropriate error * response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...) { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK char statestr[STATE_STRING_BUF_LEN]; const char *errstr; #endif #ifdef CONFIG_PRINTK_INDEX printk_index_subsys_emit( "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt); #endif /* * Special case: if the error is EROFS, and we're already under * SB_RDONLY, then it is safe here. */ if (error == -EROFS && sb_rdonly(sb)) return; #ifdef CONFIG_PRINTK errstr = btrfs_decode_error(error); btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", sb->s_id, statestr, function, line, error, errstr, &vaf); va_end(args); } else { pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", sb->s_id, statestr, function, line, error, errstr); } #endif /* * Today we only save the error info to memory. Long term we'll also * send it down to the disk. */ WRITE_ONCE(fs_info->fs_error, error); /* Don't go through full error handling during mount. */ if (!(sb->s_flags & SB_BORN)) return; if (sb_rdonly(sb)) return; btrfs_discard_stop(fs_info); /* Handle error by forcing the filesystem readonly. */ btrfs_set_sb_rdonly(sb); btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not canceled here * although there is no way to update the progress. It would add the * risk of a deadlock, therefore the canceling is omitted. The only * penalty is that some I/O remains active until the procedure * completes. The next time when the filesystem is mounted writable * again, the device replace operation continues. */ } #ifdef CONFIG_PRINTK static const char * const logtypes[] = { "emergency", "alert", "critical", "error", "warning", "notice", "info", "debug", }; /* * Use one ratelimit state per log level so that a flood of less important * messages doesn't cause more important ones to be dropped. */ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; int kern_level; const char *type = logtypes[4]; struct ratelimit_state *ratelimit = &printk_limits[4]; #ifdef CONFIG_PRINTK_INDEX printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); #endif va_start(args, fmt); while ((kern_level = printk_get_level(fmt)) != 0) { size_t size = printk_skip_level(fmt) - fmt; if (kern_level >= '0' && kern_level <= '7') { memcpy(lvl, fmt, size); lvl[size] = '\0'; type = logtypes[kern_level - '0']; ratelimit = &printk_limits[kern_level - '0']; } fmt += size; } vaf.fmt = fmt; vaf.va = &args; /* Do not ratelimit if CONFIG_BTRFS_DEBUG is enabled. */ if (IS_ENABLED(CONFIG_BTRFS_DEBUG) || __ratelimit(ratelimit)) { if (fs_info) { char statestr[STATE_STRING_BUF_LEN]; btrfs_state_to_string(fs_info, statestr); _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, fs_info->sb->s_id, statestr, &vaf); } else { _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); } } va_end(args); } #endif #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); btrfs_warn(fs_info, "due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", BTRFS_32BIT_MAX_FILE_SIZE >> 40); btrfs_warn(fs_info, "please consider upgrading to 64bit kernel/hardware"); } } void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) { if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { btrfs_err(fs_info, "reached 32bit limit for logical addresses"); btrfs_err(fs_info, "due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", BTRFS_32BIT_MAX_FILE_SIZE >> 40); btrfs_err(fs_info, "please consider upgrading to 64bit kernel/hardware"); } } #endif /* * Decode unexpected, fatal errors from the caller, issue an alert, and either * panic or BUGs, depending on mount options. */ __cold void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...) { char *s_id = "<unknown>"; const char *errstr; struct va_format vaf = { .fmt = fmt }; va_list args; if (fs_info) s_id = fs_info->sb->s_id; va_start(args, fmt); vaf.va = &args; errstr = btrfs_decode_error(error); if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, error, errstr); btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", function, line, &vaf, error, errstr); va_end(args); /* Caller calls BUG() */ } |
| 9 10 9 8 3 10 5 9 2 10 9 2 6 6 13 14 13 13 3 12 13 13 32 44 3 44 3 3 3 3 13 2 13 19 22 21 16 3 2 3 1 7 10 8 4 3 6 3 3 2 5 10 3 1 12 2 15 10 9 1 8 5 16 29 28 1 12 18 12 1 1 1 9 27 1 1 3 21 18 1 1 1 1 5 9 11 12 12 12 3 4 1 1 4 2 1 1 11 1 1 1 1 4 1 4 1 1 3 5 7 7 7 3 1 2 7 4 4 7 6 6 4 5 7 3 7 3 7 7 3 9 7 6 7 3 4 9 6 8 7 4 7 7 4 4 4 4 7 7 19 16 9 9 9 9 9 2 14 1 1 1 9 3 9 3 1 8 8 11 1 11 6 5 1 1 8 1 1 1 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" #include "memmap.h" #include "register.h" struct io_rsrc_update { struct file *file; u64 arg; u32 nr_args; u32 offset; }; static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) static const struct io_mapped_ubuf dummy_ubuf = { /* set invalid range, so io_import_fixed() fails meeting it */ .ubuf = -1UL, .len = UINT_MAX, }; int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; if (!nr_pages) return 0; /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; cur_pages = atomic_long_read(&user->locked_vm); do { new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); return 0; } static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; } if (ctx->mm_account) atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); return 0; } static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong. */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; if (!iov->iov_len) return -EFAULT; /* arbitrary limit, but we need something */ if (iov->iov_len > SZ_1G) return -EFAULT; if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) return -EOVERFLOW; return 0; } static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) { struct io_mapped_ubuf *imu = *slot; unsigned int i; *slot = NULL; if (imu != &dummy_ubuf) { if (!refcount_dec_and_test(&imu->refs)) return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); } } static void io_rsrc_put_work(struct io_rsrc_node *node) { struct io_rsrc_put *prsrc = &node->item; if (prsrc->tag) io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); switch (node->type) { case IORING_RSRC_FILE: fput(prsrc->file); break; case IORING_RSRC_BUFFER: io_rsrc_buf_put(node->ctx, prsrc); break; default: WARN_ON_ONCE(1); break; } } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) kfree(node); } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->ctx->uring_lock) { struct io_ring_ctx *ctx = node->ctx; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ if (node->refs) break; list_del(&node->node); if (likely(!node->empty)) io_rsrc_put_work(node); io_rsrc_node_destroy(ctx, node); } if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) wake_up_all(&ctx->rsrc_quiesce_wq); } struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); if (!ref_node) { ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; } ref_node->ctx = ctx; ref_node->empty = 0; ref_node->refs = 1; return ref_node; } __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { struct io_rsrc_node *backup; DEFINE_WAIT(we); int ret; /* As We may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; backup = io_rsrc_node_alloc(ctx); if (!backup) return -ENOMEM; ctx->rsrc_node->empty = true; ctx->rsrc_node->type = -1; list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, ctx->rsrc_node); ctx->rsrc_node = backup; if (list_empty(&ctx->rsrc_ref_list)) return 0; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 1); smp_mb(); } ctx->rsrc_quiesce++; data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(ctx); if (ret < 0) { finish_wait(&ctx->rsrc_quiesce_wq, &we); mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; break; } schedule(); mutex_lock(&ctx->uring_lock); ret = 0; } while (!list_empty(&ctx->rsrc_ref_list)); finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; ctx->rsrc_quiesce--; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 0); smp_mb(); } return ret; } static void io_free_page_table(void **table, size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); for (i = 0; i < nr_tables; i++) kfree(table[i]); kfree(table); } static void io_rsrc_data_free(struct io_rsrc_data *data) { size_t size = data->nr * sizeof(data->tags[0][0]); if (data->tags) io_free_page_table((void **)data->tags, size); kfree(data); } static __cold void **io_alloc_page_table(size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); size_t init_size = size; void **table; table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); return NULL; } size -= this_size; } return table; } __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, u64 __user *utags, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; int ret = 0; unsigned i; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); if (!data->tags) { kfree(data); return -ENOMEM; } data->nr = nr; data->ctx = ctx; data->rsrc_type = type; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { u64 *tag_slot = io_get_tag_slot(data, i); if (copy_from_user(tag_slot, &utags[i], sizeof(*tag_slot))) goto fail; } } *pdata = data; return 0; fail: io_rsrc_data_free(data); return ret; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; int fd, i, err = 0; unsigned int done; if (!ctx->file_data) return -ENXIO; if (up->offset + nr_args > ctx->nr_user_files) return -EINVAL; for (done = 0; done < nr_args; done++) { u64 tag = 0; if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { err = -EINVAL; break; } if (fd == IORING_REGISTER_FILES_SKIP) continue; i = array_index_nospec(up->offset + done, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, i); if (file_slot->file_ptr) { err = io_queue_rsrc_removal(data, i, io_slot_file(file_slot)); if (err) break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); } if (fd != -1) { struct file *file = fget(fd); if (!file) { err = -EBADF; break; } /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; } *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); } } return done ? done : err; } static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec fast_iov, *iov; struct page *last_hpage = NULL; struct iovec __user *uvec; u64 user_data = up->data; __u32 done; int i, err; if (!ctx->buf_data) return -ENXIO; if (up->offset + nr_args > ctx->nr_user_bufs) return -EINVAL; for (done = 0; done < nr_args; done++) { struct io_mapped_ubuf *imu; u64 tag = 0; uvec = u64_to_user_ptr(user_data); iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } err = io_buffer_validate(iov); if (err) break; if (!iov->iov_base && tag) { err = -EINVAL; break; } err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); if (err) break; i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != &dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; } ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; } ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; if (ctx->compat) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); } return done ? done : err; } static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args) { __u32 tmp; lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; switch (type) { case IORING_RSRC_FILE: return __io_sqe_files_update(ctx, up, nr_args); case IORING_RSRC_BUFFER: return __io_sqe_buffers_update(ctx, up, nr_args); } return -EINVAL; } int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_rsrc_update2 up; if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; if (size != sizeof(up)) return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; /* keep it extendible */ if (size != sizeof(rr)) return -EINVAL; memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; if (!rr.nr || rr.resv2) return -EINVAL; if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } return -EINVAL; } int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; up->offset = READ_ONCE(sqe->off); up->nr_args = READ_ONCE(sqe->len); if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; if (!req->ctx->file_data) return -ENXIO; for (done = 0; done < up->nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; break; } file = fget(fd); if (!file) { ret = -EBADF; break; } ret = io_fixed_fd_install(req, issue_flags, file, IORING_FILE_INDEX_ALLOC); if (ret < 0) break; if (copy_to_user(&fds[done], &ret, sizeof(ret))) { __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } } if (done) return done; return ret; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); struct io_ring_ctx *ctx = req->ctx; struct io_uring_rsrc_update2 up2; int ret; up2.offset = up->offset; up2.data = up->arg; up2.nr = 0; up2.tags = 0; up2.resv = 0; up2.resv2 = 0; if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) { struct io_ring_ctx *ctx = data->ctx; struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); ctx->rsrc_node = io_rsrc_node_alloc(ctx); if (unlikely(!ctx->rsrc_node)) { ctx->rsrc_node = node; return -ENOMEM; } node->item.rsrc = rsrc; node->type = data->rsrc_type; node->item.tag = *tag_slot; *tag_slot = 0; list_add_tail(&node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, node); return 0; } void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { int i; for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(&ctx->file_table, i); if (!file) continue; io_file_bitmap_clear(&ctx->file_table, i); fput(file); } io_free_file_tables(&ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { unsigned nr = ctx->nr_user_files; int ret; if (!ctx->file_data) return -ENXIO; /* * Quiesce may unlock ->uring_lock, and while it's not held * prevent new requests using the table. */ ctx->nr_user_files = 0; ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); ctx->nr_user_files = nr; if (!ret) __io_sqe_files_unregister(ctx); return ret; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { __s32 __user *fds = (__s32 __user *) arg; struct file *file; int fd, ret; unsigned i; if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, &ctx->file_data); if (ret) return ret; if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; return -ENOMEM; } for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct io_fixed_file *file_slot; if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto fail; } /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) goto fail; /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); goto fail; } file_slot = io_fixed_file_slot(&ctx->file_table, i); io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); return 0; fail: __io_sqe_files_unregister(ctx); return ret; } static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { io_buffer_unmap(ctx, &prsrc->buf); prsrc->buf = NULL; } void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { unsigned int i; for (i = 0; i < ctx->nr_user_bufs; i++) io_buffer_unmap(ctx, &ctx->user_bufs[i]); kfree(ctx->user_bufs); io_rsrc_data_free(ctx->buf_data); ctx->user_bufs = NULL; ctx->buf_data = NULL; ctx->nr_user_bufs = 0; } int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { unsigned nr = ctx->nr_user_bufs; int ret; if (!ctx->buf_data) return -ENXIO; /* * Quiesce may unlock ->uring_lock, and while it's not held * prevent new requests using the table. */ ctx->nr_user_bufs = 0; ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); ctx->nr_user_bufs = nr; if (!ret) __io_sqe_buffers_unregister(ctx); return ret; } /* * Not super efficient, but this is just a registration time. And we do cache * the last compound head, so generally we'll only do a full search if we don't * match that one. * * We check if the given compound head page has already been accounted, to * avoid double accounting it. This allows us to account the full size of the * page, not just the constituent pages of a huge page. */ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct page *hpage) { int i, j; /* check current page array */ for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) continue; if (compound_head(pages[i]) == hpage) return true; } /* check previously registered pages */ for (i = 0; i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *imu = ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; if (compound_head(imu->bvec[j].bv_page) == hpage) return true; } } return false; } static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct io_mapped_ubuf *imu, struct page **last_hpage) { int i, ret; imu->acct_pages = 0; for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) { imu->acct_pages++; } else { struct page *hpage; hpage = compound_head(pages[i]); if (hpage == *last_hpage) continue; *last_hpage = hpage; if (headpage_already_acct(ctx, pages, i, hpage)) continue; imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; } } if (!imu->acct_pages) return 0; ret = io_account_mem(ctx, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; } static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data, int nr_folios) { struct page **page_array = *pages, **new_array = NULL; int nr_pages_left = *nr_pages, i, j; /* Store head pages only*/ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; new_array[0] = compound_head(page_array[0]); /* * The pages are bound to the folio, it doesn't * actually unpin them but drops all but one reference, * which is usually put down by io_buffer_unmap(). * Note, needs a better helper. */ if (data->nr_pages_head > 1) unpin_user_pages(&page_array[1], data->nr_pages_head - 1); j = data->nr_pages_head; nr_pages_left -= data->nr_pages_head; for (i = 1; i < nr_folios; i++) { unsigned int nr_unpin; new_array[i] = page_array[j]; nr_unpin = min_t(unsigned int, nr_pages_left - 1, data->nr_pages_mid - 1); if (nr_unpin) unpin_user_pages(&page_array[j+1], nr_unpin); j += data->nr_pages_mid; nr_pages_left -= data->nr_pages_mid; } kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; return true; } static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages; struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; if (*nr_pages <= 1) return false; data->nr_pages_mid = folio_nr_pages(folio); if (data->nr_pages_mid == 1) return false; data->folio_shift = folio_shift(folio); /* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. */ for (i = 1; i < *nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; continue; } if (nr_folios == 1) { if (folio_page_idx(folio, page_array[i-1]) != data->nr_pages_mid - 1) return false; data->nr_pages_head = count; } else if (count != data->nr_pages_mid) { return false; } folio = page_folio(page_array[i]); if (folio_size(folio) != (1UL << data->folio_shift) || folio_page_idx(folio, page_array[i]) != 0) return false; count = 1; nr_folios++; } if (nr_folios == 1) data->nr_pages_head = count; return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); } static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; unsigned long off; size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; bool coalesced; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) return 0; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); pages = NULL; goto done; } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) goto done; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { unpin_user_pages(pages, nr_pages); goto done; } size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); *pimu = imu; ret = 0; for (i = 0; i < nr_pages; i++) { size_t vec_len; vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; } done: if (ret) kvfree(imu); kvfree(pages); return ret; } static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) { ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); return ctx->user_bufs ? 0 : -ENOMEM; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; struct io_rsrc_data *data; struct iovec fast_iov, *iov = &fast_iov; const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); if (ctx->user_bufs) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); if (ret) return ret; ret = io_buffers_map_alloc(ctx, nr_args); if (ret) { io_rsrc_data_free(data); return ret; } if (!arg) memset(iov, 0, sizeof(*iov)); for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { uvec = (struct iovec __user *) arg; iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } ret = io_buffer_validate(iov); if (ret) break; if (ctx->compat) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); } if (!iov->iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], &last_hpage); if (ret) break; } WARN_ON_ONCE(ctx->buf_data); ctx->buf_data = data; if (ret) __io_sqe_buffers_unregister(ctx); return ret; } int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { u64 buf_end; size_t offset; if (WARN_ON_ONCE(!imu)) return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; /* * Might not be a start of buffer, set size appropriately * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates * over each segment manually. We can cheat a bit here, because * we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the * first and last bvec * * So just find our index, and adjust the iterator afterwards. * If the offset is within the first bvec (or the whole first * bvec, just use iov_iter_advance(). This makes it easier * since we can just skip the first segment, which may not * be folio_size aligned. */ const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { iter->bvec = bvec; iter->count -= offset; iter->iov_offset = offset; } else { unsigned long seg_skip; /* skip first vec */ offset -= bvec->bv_len; seg_skip = 1 + (offset >> imu->folio_shift); iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } return 0; } static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) { struct io_mapped_ubuf **user_bufs; struct io_rsrc_data *data; int i, ret, nbufs; /* * Drop our own lock here. We'll setup the data we need and reference * the source buffers, then re-grab, check, and assign at the end. */ mutex_unlock(&ctx->uring_lock); mutex_lock(&src_ctx->uring_lock); ret = -ENXIO; nbufs = src_ctx->nr_user_bufs; if (!nbufs) goto out_unlock; ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data); if (ret) goto out_unlock; ret = -ENOMEM; user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL); if (!user_bufs) goto out_free_data; for (i = 0; i < nbufs; i++) { struct io_mapped_ubuf *src = src_ctx->user_bufs[i]; refcount_inc(&src->refs); user_bufs[i] = src; } /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ mutex_unlock(&src_ctx->uring_lock); mutex_lock(&ctx->uring_lock); if (!ctx->user_bufs) { ctx->user_bufs = user_bufs; ctx->buf_data = data; ctx->nr_user_bufs = nbufs; return 0; } /* someone raced setting up buffers, dump ours */ for (i = 0; i < nbufs; i++) io_buffer_unmap(ctx, &user_bufs[i]); io_rsrc_data_free(data); kfree(user_bufs); return -EBUSY; out_free_data: io_rsrc_data_free(data); out_unlock: mutex_unlock(&src_ctx->uring_lock); mutex_lock(&ctx->uring_lock); return ret; } /* * Copy the registered buffers from the source ring whose file descriptor * is given in the src_fd to the current ring. This is identical to registering * the buffers with ctx, except faster as mappings already exist. * * Since the memory is already accounted once, don't account it again. */ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_clone_buffers buf; bool registered_src; struct file *file; int ret; if (ctx->user_bufs || ctx->nr_user_bufs) return -EBUSY; if (copy_from_user(&buf, arg, sizeof(buf))) return -EFAULT; if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED) return -EINVAL; if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) return -EINVAL; registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); ret = io_clone_buffers(ctx, file->private_data); if (!registered_src) fput(file); return ret; } |
| 1 1 1 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Author: Michael S. Tsirkin <mst@redhat.com> * * virtio-net server in host kernel. */ #include <linux/compat.h> #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/if_tun.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/xdp.h> #include "vhost.h" static int experimental_zcopytx = 0; module_param(experimental_zcopytx, int, 0444); MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" " 1 -Enable; 0 - Disable"); /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with small * pkts. */ #define VHOST_NET_PKT_WEIGHT 256 /* MAX number of TX used buffers for outstanding zerocopy */ #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* * For transmit, used buffer len is unused; we override it to track buffer * status internally; used for zerocopy tx only. */ /* Lower device DMA failed */ #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) /* Lower device DMA done */ #define VHOST_DMA_DONE_LEN ((__force __virtio32)2) /* Lower device DMA in progress */ #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) /* Buffer unused */ #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_RESET) }; enum { VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, }; struct vhost_net_ubuf_ref { /* refcount follows semantics similar to kref: * 0: object is released * 1: no outstanding ubufs * >1: outstanding ubufs */ atomic_t refcount; wait_queue_head_t wait; struct vhost_virtqueue *vq; }; #define VHOST_NET_BATCH 64 struct vhost_net_buf { void **queue; int tail; int head; }; struct vhost_net_virtqueue { struct vhost_virtqueue vq; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ /* last used idx for outstanding DMA zerocopy buffers */ int upend_idx; /* For TX, first used idx for DMA done zerocopy buffers * For RX, number of batched heads */ int done_idx; /* Number of XDP frames batched */ int batched_xdp; /* an array of userspace buffers info */ struct ubuf_info_msgzc *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ struct vhost_net_ubuf_ref *ubufs; struct ptr_ring *rx_ring; struct vhost_net_buf rxq; /* Batched XDP buffs */ struct xdp_buff *xdp; }; struct vhost_net { struct vhost_dev dev; struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; /* Number of times zerocopy TX recently failed. * Protected by tx vq lock. */ unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. */ bool tx_flush; /* Private page frag cache */ struct page_frag_cache pf_cache; }; static unsigned vhost_net_zcopy_mask __read_mostly; static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) { if (rxq->tail != rxq->head) return rxq->queue[rxq->head]; else return NULL; } static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) { return rxq->tail - rxq->head; } static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) { return rxq->tail == rxq->head; } static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) { void *ret = vhost_net_buf_get_ptr(rxq); ++rxq->head; return ret; } static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; rxq->head = 0; rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, VHOST_NET_BATCH); return rxq->tail; } static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), tun_ptr_free); rxq->head = rxq->tail = 0; } } static int vhost_net_buf_peek_len(void *ptr) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (!vhost_net_buf_is_empty(rxq)) goto out; if (!vhost_net_buf_produce(nvq)) return 0; out: return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); } static void vhost_net_buf_init(struct vhost_net_buf *rxq) { rxq->head = rxq->tail = 0; } static void vhost_net_enable_zcopy(int vq) { vhost_net_zcopy_mask |= 0x1 << vq; } static struct vhost_net_ubuf_ref * vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. */ if (!zcopy) return NULL; ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); if (!ubufs) return ERR_PTR(-ENOMEM); atomic_set(&ubufs->refcount, 1); init_waitqueue_head(&ubufs->wait); ubufs->vq = vq; return ubufs; } static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { int r = atomic_sub_return(1, &ubufs->refcount); if (unlikely(!r)) wake_up(&ubufs->wait); return r; } static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put(ubufs); wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); } static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put_and_wait(ubufs); kfree(ubufs); } static void vhost_net_clear_ubuf_info(struct vhost_net *n) { int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { kfree(n->vqs[i].ubuf_info); n->vqs[i].ubuf_info = NULL; } } static int vhost_net_set_ubuf_info(struct vhost_net *n) { bool zcopy; int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { zcopy = vhost_net_zcopy_mask & (0x1 << i); if (!zcopy) continue; n->vqs[i].ubuf_info = kmalloc_array(UIO_MAXIOV, sizeof(*n->vqs[i].ubuf_info), GFP_KERNEL); if (!n->vqs[i].ubuf_info) goto err; } return 0; err: vhost_net_clear_ubuf_info(n); return -ENOMEM; } static void vhost_net_vq_reset(struct vhost_net *n) { int i; vhost_net_clear_ubuf_info(n); for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].done_idx = 0; n->vqs[i].upend_idx = 0; n->vqs[i].ubufs = NULL; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; vhost_net_buf_init(&n->vqs[i].rxq); } } static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; if (net->tx_packets < 1024) return; net->tx_packets = 0; net->tx_zcopy_err = 0; } static void vhost_net_tx_err(struct vhost_net *net) { ++net->tx_zcopy_err; } static bool vhost_net_tx_select_zcopy(struct vhost_net *net) { /* TX flush waits for outstanding DMAs to be done. * Don't start new DMAs. */ return !net->tx_flush && net->tx_packets / 64 >= net->tx_zcopy_err; } static bool vhost_sock_zcopy(struct socket *sock) { return unlikely(experimental_zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY); } static bool vhost_sock_xdp(struct socket *sock) { return sock_flag(sock->sk, SOCK_XDP); } /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM * guest used idx. */ static void vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); int i, add; int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { vq->heads[i].len = VHOST_DMA_CLEAR_LEN; ++j; } else break; } while (j) { add = min(UIO_MAXIOV - nvq->done_idx, j); vhost_add_used_and_signal_n(vq->dev, vq, &vq->heads[nvq->done_idx], add); nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; j -= add; } } static void vhost_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; int cnt; rcu_read_lock_bh(); /* set len to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = success ? VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; cnt = vhost_net_ubuf_put(ubufs); /* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1. * We also trigger polling periodically after each 16 packets * (the value 16 here is more or less arbitrary, it's tuned to trigger * less than 10% of times). */ if (cnt <= 1 || !(cnt % 16)) vhost_poll_queue(&vq->poll); rcu_read_unlock_bh(); } static const struct ubuf_info_ops vhost_ubuf_ops = { .complete = vhost_zerocopy_complete, }; static inline unsigned long busy_clock(void) { return local_clock() >> 10; } static bool vhost_can_busy_poll(unsigned long endtime) { return likely(!need_resched() && !time_after(busy_clock(), endtime) && !signal_pending(current)); } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vhost_vq_get_backend(vq)) return; vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = vhost_vq_get_backend(vq); if (!sock) return 0; return vhost_poll_start(poll, sock->file); } static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_dev *dev = vq->dev; if (!nvq->done_idx) return; vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); nvq->done_idx = 0; } static void vhost_tx_batch(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct socket *sock, struct msghdr *msghdr) { struct tun_msg_ctl ctl = { .type = TUN_MSG_PTR, .num = nvq->batched_xdp, .ptr = nvq->xdp, }; int i, err; if (nvq->batched_xdp == 0) goto signal_used; msghdr->msg_control = &ctl; msghdr->msg_controllen = sizeof(ctl); err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); /* free pages owned by XDP; since this is an unlikely error path, * keep it simple and avoid more complex bulk update for the * used pages */ for (i = 0; i < nvq->batched_xdp; ++i) put_page(virt_to_head_page(nvq->xdp[i].data)); nvq->batched_xdp = 0; nvq->done_idx = 0; return; } signal_used: vhost_net_signal_used(nvq); nvq->batched_xdp = 0; } static int sock_has_rx_data(struct socket *sock) { if (unlikely(!sock)) return 0; if (sock->ops->peek_len) return sock->ops->peek_len(sock); return skb_queue_empty(&sock->sk->sk_receive_queue); } static void vhost_net_busy_poll_try_queue(struct vhost_net *net, struct vhost_virtqueue *vq) { if (!vhost_vq_avail_empty(&net->dev, vq)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); vhost_poll_queue(&vq->poll); } } static void vhost_net_busy_poll(struct vhost_net *net, struct vhost_virtqueue *rvq, struct vhost_virtqueue *tvq, bool *busyloop_intr, bool poll_rx) { unsigned long busyloop_timeout; unsigned long endtime; struct socket *sock; struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; /* Try to hold the vq mutex of the paired virtqueue. We can't * use mutex_lock() here since we could not guarantee a * consistenet lock ordering. */ if (!mutex_trylock(&vq->mutex)) return; vhost_disable_notify(&net->dev, vq); sock = vhost_vq_get_backend(rvq); busyloop_timeout = poll_rx ? rvq->busyloop_timeout: tvq->busyloop_timeout; preempt_disable(); endtime = busy_clock() + busyloop_timeout; while (vhost_can_busy_poll(endtime)) { if (vhost_vq_has_work(vq)) { *busyloop_intr = true; break; } if ((sock_has_rx_data(sock) && !vhost_vq_avail_empty(&net->dev, rvq)) || !vhost_vq_avail_empty(&net->dev, tvq)) break; cpu_relax(); } preempt_enable(); if (poll_rx || sock_has_rx_data(sock)) vhost_net_busy_poll_try_queue(net, vq); else if (!poll_rx) /* On tx here, sock has no rx data. */ vhost_enable_notify(&net->dev, rvq); mutex_unlock(&vq->mutex); } static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, struct msghdr *msghdr, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); if (r == tvq->num && tvq->busyloop_timeout) { /* Flush batched packets first */ if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq))) vhost_tx_batch(net, tnvq, vhost_vq_get_backend(tvq), msghdr); vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); } return r; } static bool vhost_exceeds_maxpend(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); } static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, size_t hdr_size, int out) { /* Skip header. TODO: support TSO. */ size_t len = iov_length(vq->iov, out); iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len); iov_iter_advance(iter, hdr_size); return iov_iter_count(iter); } static int get_tx_bufs(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct msghdr *msg, unsigned int *out, unsigned int *in, size_t *len, bool *busyloop_intr) { struct vhost_virtqueue *vq = &nvq->vq; int ret; ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); if (ret < 0 || ret == vq->num) return ret; if (*in) { vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n", *out, *in); return -EFAULT; } /* Sanity check */ *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); if (*len == 0) { vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", *len, nvq->vhost_hlen); return -EFAULT; } return ret; } static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) { return total_len < VHOST_NET_WEIGHT && !vhost_vq_avail_empty(vq->dev, vq); } #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct iov_iter *from) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; struct tun_xdp_hdr *hdr; size_t len = iov_iter_count(from); int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); int sock_hlen = nvq->sock_hlen; void *buf; int copied; int ret; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; if (SKB_DATA_ALIGN(len + pad) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, SMP_CACHE_BYTES); if (unlikely(!buf)) return -ENOMEM; copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso), sock_hlen, from); if (copied != sock_hlen) { ret = -EFAULT; goto err; } hdr = buf; gso = &hdr->gso; if (!sock_hlen) memset(buf, 0, pad); if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2 > vhost16_to_cpu(vq, gso->hdr_len)) { gso->hdr_len = cpu_to_vhost16(vq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); if (vhost16_to_cpu(vq, gso->hdr_len) > len) { ret = -EINVAL; goto err; } } len -= sock_hlen; copied = copy_from_iter(buf + pad, len, from); if (copied != len) { ret = -EFAULT; goto err; } xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len, true); hdr->buflen = buflen; ++nvq->batched_xdp; return 0; err: page_frag_free(buf); return ret; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); do { bool busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } total_len += len; /* For simplicity, TX batching is only enabled if * sndbuf is unlimited. */ if (sock_can_batch) { err = vhost_net_build_xdp(nvq, &msg.msg_iter); if (!err) { goto done; } else if (unlikely(err != -ENOSPC)) { vhost_tx_batch(net, nvq, sock, &msg); vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } /* We can't build XDP buff, go for single * packet path but let's flush batched * packets. */ vhost_tx_batch(net, nvq, sock, &msg); msg.msg_control = NULL; } else { if (tx_can_batch(vq, total_len)) msg.msg_flags |= MSG_MORE; else msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->done_idx].len = 0; ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); vhost_tx_batch(net, nvq, sock, &msg); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *ubufs; struct ubuf_info_msgzc *ubuf; bool zcopy_used; int sent_pkts = 0; do { bool busyloop_intr; /* Release DMAs done buffers first */ vhost_zerocopy_signal_used(net, vq); busyloop_intr = false; head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } zcopy_used = len >= VHOST_GOODCOPY_LEN && !vhost_exceeds_maxpend(net) && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { ubuf = nvq->ubuf_info + nvq->upend_idx; vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; ubuf->ubuf.ops = &vhost_ubuf_ops; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; ctl.type = TUN_MSG_UBUF; ctl.ptr = &ubuf->ubuf; msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } else { msg.msg_control = NULL; ubufs = NULL; } total_len += len; if (tx_can_batch(vq, total_len) && likely(!vhost_exceeds_maxpend(net))) { msg.msg_flags |= MSG_MORE; } else { msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); if (retry) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; else vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } if (retry) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used) vhost_add_used_and_signal(&net->dev, vq, head, 0); else vhost_zerocopy_signal_used(net, vq); vhost_net_tx_packet(net); } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; struct socket *sock; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); if (vhost_sock_zcopy(sock)) handle_tx_zerocopy(net, sock); else handle_tx_copy(net, sock); out: mutex_unlock(&vq->mutex); } static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) { struct sk_buff *head; int len = 0; unsigned long flags; if (rvq->rx_ring) return vhost_net_buf_peek(rvq); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int len = peek_head_len(rnvq, sk); if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ vhost_net_signal_used(rnvq); /* Both tx vq and rx socket were polled here */ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); len = peek_head_len(rnvq, sk); } return len; } /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. * @vq - the relevant virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_virtqueue *vq, struct vring_used_elem *heads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota) { unsigned int out, in; int seg = 0; int headcount = 0; unsigned d; int r, nlogs = 0; /* len is always initialized before use since we are always called with * datalen > 0. */ u32 len; while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; } r = vhost_get_vq_desc(vq, vq->iov + seg, ARRAY_SIZE(vq->iov) - seg, &out, &in, log, log_num); if (unlikely(r < 0)) goto err; d = r; if (d == vq->num) { r = 0; goto err; } if (unlikely(out || in <= 0)) { vq_err(vq, "unexpected descriptor format for RX: " "out %d, in %d\n", out, in); r = -EINVAL; goto err; } if (unlikely(log)) { nlogs += *log_num; log += *log_num; } heads[headcount].id = cpu_to_vhost32(vq, d); len = iov_length(vq->iov + seg, in); heads[headcount].len = cpu_to_vhost32(vq, len); datalen -= len; ++headcount; seg += in; } heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); *iovcount = seg; if (unlikely(log)) *log_num = nlogs; /* Detect overrun */ if (unlikely(datalen > 0)) { r = UIO_MAXIOV + 1; goto err; } return headcount; err: vhost_discard_vq_desc(vq, headcount); return r; } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; int recv_pkts = 0; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); vhost_hlen = nvq->vhost_hlen; sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, &busyloop_intr); if (!sock_len) break; sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled. */ goto out; } busyloop_intr = false; if (nvq->rx_ring) msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1); err = sock->ops->recvmsg(sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); continue; } /* We don't need to be notified again. */ iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len); fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO. */ iov_iter_advance(&msg.msg_iter, vhost_hlen); } err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard * to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); vhost_discard_vq_desc(vq, headcount); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); goto out; } } else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF */ iov_iter_advance(&fixup, sizeof(hdr)); } /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); if (likely(mergeable) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); goto out; } nvq->done_idx += headcount; if (nvq->done_idx > VHOST_NET_BATCH) vhost_net_signal_used(nvq); if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in); total_len += vhost_len; } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); if (unlikely(busyloop_intr)) vhost_poll_queue(&vq->poll); else if (!sock_len) vhost_net_enable_vq(net, vq); out: vhost_net_signal_used(nvq); mutex_unlock(&vq->mutex); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; void **queue; struct xdp_buff *xdp; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); return -ENOMEM; } queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), GFP_KERNEL); if (!queue) { kfree(vqs); kvfree(n); return -ENOMEM; } n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); if (!xdp) { kfree(vqs); kvfree(n); kfree(queue); return -ENOMEM; } n->vqs[VHOST_NET_VQ_TX].xdp = xdp; dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].ubufs = NULL; n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; n->vqs[i].rx_ring = NULL; vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, UIO_MAXIOV + VHOST_NET_BATCH, VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true, NULL); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev, vqs[VHOST_NET_VQ_TX]); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev, vqs[VHOST_NET_VQ_RX]); f->private_data = n; n->pf_cache.va = NULL; return 0; } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); mutex_lock(&vq->mutex); sock = vhost_vq_get_backend(vq); vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, NULL); vhost_net_buf_unproduce(nvq); nvq->rx_ring = NULL; mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush(struct vhost_net *n) { vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } static int vhost_net_release(struct inode *inode, struct file *f) { struct vhost_net *n = f->private_data; struct socket *tx_sock; struct socket *rx_sock; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev); vhost_net_vq_reset(n); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ synchronize_rcu(); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); page_frag_cache_drain(&n->pf_cache); kvfree(n); return 0; } static struct socket *get_raw_socket(int fd) { int r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) return ERR_PTR(-ENOTSOCK); /* Parameter checking */ if (sock->sk->sk_type != SOCK_RAW) { r = -ESOCKTNOSUPPORT; goto err; } if (sock->sk->sk_family != AF_PACKET) { r = -EPFNOSUPPORT; goto err; } return sock; err: sockfd_put(sock); return ERR_PTR(r); } static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; ring = tap_get_ptr_ring(file); if (!IS_ERR(ring)) goto out; ring = NULL; out: return ring; } static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); if (!IS_ERR(sock)) return sock; sock = tap_get_socket(file); if (IS_ERR(sock)) fput(file); return sock; } static struct socket *get_socket(int fd) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock; sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); } static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; struct vhost_net_virtqueue *nvq; struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; int r; mutex_lock(&n->dev.mutex); r = vhost_dev_check_owner(&n->dev); if (r) goto err; if (index >= VHOST_NET_VQ_MAX) { r = -ENOBUFS; goto err; } vq = &n->vqs[index].vq; nvq = &n->vqs[index]; mutex_lock(&vq->mutex); if (fd == -1) vhost_clear_msg(&n->dev); /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; goto err_vq; } sock = get_socket(fd); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; } /* start polling new socket */ oldsock = vhost_vq_get_backend(vq); if (sock != oldsock) { ubufs = vhost_net_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock)); if (IS_ERR(ubufs)) { r = PTR_ERR(ubufs); goto err_ubufs; } vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, sock); vhost_net_buf_unproduce(nvq); r = vhost_vq_init_access(vq); if (r) goto err_used; r = vhost_net_enable_vq(n, vq); if (r) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock) nvq->rx_ring = get_tap_ptr_ring(sock->file); else nvq->rx_ring = NULL; } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; n->tx_flush = false; } mutex_unlock(&vq->mutex); if (oldubufs) { vhost_net_ubuf_put_wait_and_free(oldubufs); mutex_lock(&vq->mutex); vhost_zerocopy_signal_used(n, vq); mutex_unlock(&vq->mutex); } if (oldsock) { vhost_dev_flush(&n->dev); sockfd_put(oldsock); } mutex_unlock(&n->dev.mutex); return 0; err_used: vhost_vq_set_backend(vq, oldsock); vhost_net_enable_vq(n, vq); if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: if (sock) sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_reset_owner(struct vhost_net *n) { struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; umem = vhost_dev_reset_owner_prepare(); if (!umem) { err = -ENOMEM; goto done; } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_reset_owner(&n->dev, umem); vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); return err; } static int vhost_net_set_features(struct vhost_net *n, u64 features) { size_t vhost_hlen, sock_hlen, hdr_len; int i; hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) ? sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; sock_hlen = 0; } else { /* socket provides vnet_hdr */ vhost_hlen = 0; sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&n->dev)) goto out_unlock; if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&n->dev)) goto out_unlock; } for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vq.acked_features = features; n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); } mutex_unlock(&n->dev.mutex); return 0; out_unlock: mutex_unlock(&n->dev.mutex); return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) { int r; mutex_lock(&n->dev.mutex); if (vhost_dev_has_owner(&n->dev)) { r = -EBUSY; goto out; } r = vhost_net_set_ubuf_info(n); if (r) goto out; r = vhost_dev_set_owner(&n->dev); if (r) vhost_net_clear_ubuf_info(n); vhost_net_flush(n); out: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; u64 features; int r; switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: features = VHOST_NET_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~VHOST_NET_FEATURES) return -EOPNOTSUPP; return vhost_net_set_features(n, features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~VHOST_NET_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&n->dev, features); return 0; case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); case VHOST_SET_OWNER: return vhost_net_set_owner(n); default: mutex_lock(&n->dev.mutex); r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return r; } } static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) { struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .read_iter = vhost_net_chr_read_iter, .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; static struct miscdevice vhost_net_misc = { .minor = VHOST_NET_MINOR, .name = "vhost-net", .fops = &vhost_net_fops, }; static int __init vhost_net_init(void) { if (experimental_zcopytx) vhost_net_enable_zcopy(VHOST_NET_VQ_TX); return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void __exit vhost_net_exit(void) { misc_deregister(&vhost_net_misc); } module_exit(vhost_net_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); MODULE_ALIAS("devname:vhost-net"); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Skb ref helpers. * */ #ifndef _LINUX_SKBUFF_REF_H #define _LINUX_SKBUFF_REF_H #include <linux/skbuff.h> /** * __skb_frag_ref - take an addition reference on a paged fragment. * @frag: the paged fragment * * Takes an additional reference on the paged fragment @frag. */ static inline void __skb_frag_ref(skb_frag_t *frag) { get_page(skb_frag_page(frag)); } /** * skb_frag_ref - take an addition reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset. * * Takes an additional reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_ref(struct sk_buff *skb, int f) { __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } bool napi_pp_put_page(netmem_ref netmem); static inline void skb_page_unref(netmem_ref netmem, bool recycle) { #ifdef CONFIG_PAGE_POOL if (recycle && napi_pp_put_page(netmem)) return; #endif put_page(netmem_to_page(netmem)); } /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment * @recycle: recycle the page if allocated via page_pool * * Releases a reference on the paged fragment @frag * or recycles the page via the page_pool API. */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { skb_page_unref(skb_frag_netmem(frag), recycle); } /** * skb_frag_unref - release a reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset * * Releases a reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (!skb_zcopy_managed(skb)) __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); } #endif /* _LINUX_SKBUFF_REF_H */ |
| 12 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | // SPDX-License-Identifier: GPL-2.0 #include <linux/build_bug.h> #include <linux/errno.h> #include <linux/errname.h> #include <linux/kernel.h> #include <linux/math.h> /* * Ensure these tables do not accidentally become gigantic if some * huge errno makes it in. On most architectures, the first table will * only have about 140 entries, but mips and parisc have more sparsely * allocated errnos (with EHWPOISON = 257 on parisc, and EDQUOT = 1133 * on mips), so this wastes a bit of space on those - though we * special case the EDQUOT case. */ #define E(err) [err + BUILD_BUG_ON_ZERO(err <= 0 || err > 300)] = "-" #err static const char *names_0[] = { E(E2BIG), E(EACCES), E(EADDRINUSE), E(EADDRNOTAVAIL), E(EADV), E(EAFNOSUPPORT), E(EAGAIN), /* EWOULDBLOCK */ E(EALREADY), E(EBADE), E(EBADF), E(EBADFD), E(EBADMSG), E(EBADR), E(EBADRQC), E(EBADSLT), E(EBFONT), E(EBUSY), E(ECANCELED), /* ECANCELLED */ E(ECHILD), E(ECHRNG), E(ECOMM), E(ECONNABORTED), E(ECONNREFUSED), /* EREFUSED */ E(ECONNRESET), E(EDEADLK), /* EDEADLOCK */ #if EDEADLK != EDEADLOCK /* mips, sparc, powerpc */ E(EDEADLOCK), #endif E(EDESTADDRREQ), E(EDOM), E(EDOTDOT), #ifndef CONFIG_MIPS E(EDQUOT), #endif E(EEXIST), E(EFAULT), E(EFBIG), E(EHOSTDOWN), E(EHOSTUNREACH), E(EHWPOISON), E(EIDRM), E(EILSEQ), #ifdef EINIT E(EINIT), #endif E(EINPROGRESS), E(EINTR), E(EINVAL), E(EIO), E(EISCONN), E(EISDIR), E(EISNAM), E(EKEYEXPIRED), E(EKEYREJECTED), E(EKEYREVOKED), E(EL2HLT), E(EL2NSYNC), E(EL3HLT), E(EL3RST), E(ELIBACC), E(ELIBBAD), E(ELIBEXEC), E(ELIBMAX), E(ELIBSCN), E(ELNRNG), E(ELOOP), E(EMEDIUMTYPE), E(EMFILE), E(EMLINK), E(EMSGSIZE), E(EMULTIHOP), E(ENAMETOOLONG), E(ENAVAIL), E(ENETDOWN), E(ENETRESET), E(ENETUNREACH), E(ENFILE), E(ENOANO), E(ENOBUFS), E(ENOCSI), E(ENODATA), E(ENODEV), E(ENOENT), E(ENOEXEC), E(ENOKEY), E(ENOLCK), E(ENOLINK), E(ENOMEDIUM), E(ENOMEM), E(ENOMSG), E(ENONET), E(ENOPKG), E(ENOPROTOOPT), E(ENOSPC), E(ENOSR), E(ENOSTR), E(ENOSYS), E(ENOTBLK), E(ENOTCONN), E(ENOTDIR), E(ENOTEMPTY), E(ENOTNAM), E(ENOTRECOVERABLE), E(ENOTSOCK), E(ENOTTY), E(ENOTUNIQ), E(ENXIO), E(EOPNOTSUPP), E(EOVERFLOW), E(EOWNERDEAD), E(EPERM), E(EPFNOSUPPORT), E(EPIPE), #ifdef EPROCLIM E(EPROCLIM), #endif E(EPROTO), E(EPROTONOSUPPORT), E(EPROTOTYPE), E(ERANGE), E(EREMCHG), #ifdef EREMDEV E(EREMDEV), #endif E(EREMOTE), E(EREMOTEIO), E(ERESTART), E(ERFKILL), E(EROFS), #ifdef ERREMOTE E(ERREMOTE), #endif E(ESHUTDOWN), E(ESOCKTNOSUPPORT), E(ESPIPE), E(ESRCH), E(ESRMNT), E(ESTALE), E(ESTRPIPE), E(ETIME), E(ETIMEDOUT), E(ETOOMANYREFS), E(ETXTBSY), E(EUCLEAN), E(EUNATCH), E(EUSERS), E(EXDEV), E(EXFULL), }; #undef E #ifdef EREFUSED /* parisc */ static_assert(EREFUSED == ECONNREFUSED); #endif #ifdef ECANCELLED /* parisc */ static_assert(ECANCELLED == ECANCELED); #endif static_assert(EAGAIN == EWOULDBLOCK); /* everywhere */ #define E(err) [err - 512 + BUILD_BUG_ON_ZERO(err < 512 || err > 550)] = "-" #err static const char *names_512[] = { E(ERESTARTSYS), E(ERESTARTNOINTR), E(ERESTARTNOHAND), E(ENOIOCTLCMD), E(ERESTART_RESTARTBLOCK), E(EPROBE_DEFER), E(EOPENSTALE), E(ENOPARAM), E(EBADHANDLE), E(ENOTSYNC), E(EBADCOOKIE), E(ENOTSUPP), E(ETOOSMALL), E(ESERVERFAULT), E(EBADTYPE), E(EJUKEBOX), E(EIOCBQUEUED), E(ERECALLCONFLICT), }; #undef E static const char *__errname(unsigned err) { if (err < ARRAY_SIZE(names_0)) return names_0[err]; if (err >= 512 && err - 512 < ARRAY_SIZE(names_512)) return names_512[err - 512]; /* But why? */ if (IS_ENABLED(CONFIG_MIPS) && err == EDQUOT) /* 1133 */ return "-EDQUOT"; return NULL; } /* * errname(EIO) -> "EIO" * errname(-EIO) -> "-EIO" */ const char *errname(int err) { const char *name = __errname(abs(err)); if (!name) return NULL; return err > 0 ? name + 1 : name; } EXPORT_SYMBOL(errname); |
| 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 | // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/seccomp.c * * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> * * Copyright (C) 2012 Google, Inc. * Will Drewry <wad@chromium.org> * * This defines a simple but solid secure-computing facility. * * Mode 1 uses a fixed list of allowed system calls. * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ #define pr_fmt(fmt) "seccomp: " fmt #include <linux/refcount.h> #include <linux/audit.h> #include <linux/compat.h> #include <linux/coredump.h> #include <linux/kmemleak.h> #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/seccomp.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/sysctl.h> /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include <asm/syscall.h> #endif #ifdef CONFIG_SECCOMP_FILTER #include <linux/file.h> #include <linux/filter.h> #include <linux/pid.h> #include <linux/ptrace.h> #include <linux/capability.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> #include <linux/lockdep.h> /* * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the * wrong direction flag in the ioctl number. This is the broken one, * which the kernel needs to keep supporting until all userspaces stop * using the wrong command number. */ #define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, SECCOMP_NOTIFY_REPLIED, }; struct seccomp_knotif { /* The struct pid of the task whose filter triggered the notification */ struct task_struct *task; /* The "cookie" for this request; this is unique for this filter. */ u64 id; /* * The seccomp data. This pointer is valid the entire time this * notification is active, since it comes from __seccomp_filter which * eclipses the entire lifecycle here. */ const struct seccomp_data *data; /* * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a * struct seccomp_knotif is created and starts out in INIT. Once the * handler reads the notification off of an FD, it transitions to SENT. * If a signal is received the state transitions back to INIT and * another message is sent. When the userspace handler replies, state * transitions to REPLIED. */ enum notify_state state; /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ int error; long val; u32 flags; /* * Signals when this has changed states, such as the listener * dying, a new seccomp addfd message, or changing to REPLIED */ struct completion ready; struct list_head list; /* outstanding addfd requests */ struct list_head addfd; }; /** * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages * * @file: A reference to the file to install in the other task * @fd: The fd number to install it at. If the fd number is -1, it means the * installing process should allocate the fd as normal. * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. * @ioctl_flags: The flags used for the seccomp_addfd ioctl. * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd * installation, or gone away (either due to successful * reply, or signal) * @list: list_head for chaining seccomp_kaddfd together. * */ struct seccomp_kaddfd { struct file *file; int fd; unsigned int flags; __u32 ioctl_flags; union { bool setfd; /* To only be set on reply */ int ret; }; struct completion completion; struct list_head list; }; /** * struct notification - container for seccomp userspace notifications. Since * most seccomp filters will not have notification listeners attached and this * structure is fairly large, we store the notification-specific stuff in a * separate structure. * * @requests: A semaphore that users of this notification can wait on for * changes. Actual reads and writes are still controlled with * filter->notify_lock. * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. */ struct notification { atomic_t requests; u32 flags; u64 next_id; struct list_head notifications; }; #ifdef SECCOMP_ARCH_NATIVE /** * struct action_cache - per-filter cache of seccomp actions per * arch/syscall pair * * @allow_native: A bitmap where each bit represents whether the * filter will always allow the syscall, for the * native architecture. * @allow_compat: A bitmap where each bit represents whether the * filter will always allow the syscall, for the * compat architecture. */ struct action_cache { DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR); #ifdef SECCOMP_ARCH_COMPAT DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR); #endif }; #else struct action_cache { }; static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, const struct seccomp_data *sd) { return false; } static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) { } #endif /* SECCOMP_ARCH_NATIVE */ /** * struct seccomp_filter - container for seccomp BPF programs * * @refs: Reference count to manage the object lifetime. * A filter's reference count is incremented for each directly * attached task, once for the dependent filter, and if * requested for the user notifier. When @refs reaches zero, * the filter can be freed. * @users: A filter's @users count is incremented for each directly * attached task (filter installation, fork(), thread_sync), * and once for the dependent filter (tracked in filter->prev). * When it reaches zero it indicates that no direct or indirect * users of that filter exist. No new tasks can get associated with * this filter after reaching 0. The @users count is always smaller * or equal to @refs. Hence, reaching 0 for @users does not mean * the filter can be freed. * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @wait_killable_recv: Put notifying process in killable state once the * notification is received by the userspace listener. * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting * with current->seccomp.filter, the most recently attached or inherited filter. * However, multiple filters may share a @prev node, by way of fork(), which * results in a unidirectional tree existing in memory. This is similar to * how namespaces work. * * seccomp_filter objects should never be modified after being attached * to a task_struct (other than @refs). */ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; bool wait_killable_recv; struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; struct mutex notify_lock; wait_queue_head_t wqh; }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) /* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ static void populate_seccomp_data(struct seccomp_data *sd) { /* * Instead of using current_pt_reg(), we're already doing the work * to safely fetch "current", so just use "task" everywhere below. */ struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, args); sd->args[0] = args[0]; sd->args[1] = args[1]; sd->args[2] = args[2]; sd->args[3] = args[3]; sd->args[4] = args[4]; sd->args[5] = args[5]; sd->instruction_pointer = KSTK_EIP(task); } /** * seccomp_check_filter - verify seccomp filter code * @filter: filter to verify * @flen: length of filter * * Takes a previously checked filter (by bpf_check_classic) and * redirects all filter code that loads struct sk_buff data * and related data through seccomp_bpf_load. It also * enforces length and alignment checking of those loads. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) { int pc; for (pc = 0; pc < flen; pc++) { struct sock_filter *ftest = &filter[pc]; u16 code = ftest->code; u32 k = ftest->k; switch (code) { case BPF_LD | BPF_W | BPF_ABS: ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; case BPF_LD | BPF_W | BPF_LEN: ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; case BPF_LDX | BPF_W | BPF_LEN: ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ case BPF_RET | BPF_K: case BPF_RET | BPF_A: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: case BPF_MISC | BPF_TAX: case BPF_MISC | BPF_TXA: case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: case BPF_JMP | BPF_JA: case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: continue; default: return -EINVAL; } } return 0; } #ifdef SECCOMP_ARCH_NATIVE static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap, size_t bitmap_size, int syscall_nr) { if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size)) return false; syscall_nr = array_index_nospec(syscall_nr, bitmap_size); return test_bit(syscall_nr, bitmap); } /** * seccomp_cache_check_allow - lookup seccomp cache * @sfilter: The seccomp filter * @sd: The seccomp data to lookup the cache with * * Returns true if the seccomp_data is cached and allowed. */ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, const struct seccomp_data *sd) { int syscall_nr = sd->nr; const struct action_cache *cache = &sfilter->cache; #ifndef SECCOMP_ARCH_COMPAT /* A native-only architecture doesn't need to check sd->arch. */ return seccomp_cache_check_allow_bitmap(cache->allow_native, SECCOMP_ARCH_NATIVE_NR, syscall_nr); #else if (likely(sd->arch == SECCOMP_ARCH_NATIVE)) return seccomp_cache_check_allow_bitmap(cache->allow_native, SECCOMP_ARCH_NATIVE_NR, syscall_nr); if (likely(sd->arch == SECCOMP_ARCH_COMPAT)) return seccomp_cache_check_allow_bitmap(cache->allow_compat, SECCOMP_ARCH_COMPAT_NR, syscall_nr); #endif /* SECCOMP_ARCH_COMPAT */ WARN_ON_ONCE(true); return false; } #endif /* SECCOMP_ARCH_NATIVE */ #define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters * @match: stores struct seccomp_filter that resulted in the return value, * unless filter returned SECCOMP_RET_ALLOW, in which case it will * be unchanged. * * Returns valid seccomp BPF response codes. */ static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; if (seccomp_cache_check_allow(f, sd)) return SECCOMP_RET_ALLOW; /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } } return ret; } #endif /* CONFIG_SECCOMP_FILTER */ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) { assert_spin_locked(¤t->sighand->siglock); if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) return false; return true; } void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { } static inline void seccomp_assign_mode(struct task_struct *task, unsigned long seccomp_mode, unsigned long flags) { assert_spin_locked(&task->sighand->siglock); task->seccomp.mode = seccomp_mode; /* * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and * filter) is set. */ smp_mb__before_atomic(); /* Assume default seccomp processes want spec flaw mitigation. */ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) arch_seccomp_spec_mitigate(task); set_task_syscall_work(task, SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER /* Returns 1 if the parent is an ancestor of the child. */ static int is_ancestor(struct seccomp_filter *parent, struct seccomp_filter *child) { /* NULL is the root ancestor. */ if (parent == NULL) return 1; for (; child; child = child->prev) if (child == parent) return 1; return 0; } /** * seccomp_can_sync_threads: checks if all threads can be synchronized * * Expects sighand and cred_guard_mutex locks to be held. * * Returns 0 on success, -ve on error, or the pid of a thread which was * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) { struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); assert_spin_locked(¤t->sighand->siglock); /* Validate all threads being eligible for synchronization. */ caller = current; for_each_thread(caller, thread) { pid_t failed; /* Skip current, since it is initiating the sync. */ if (thread == caller) continue; /* Skip exited threads. */ if (thread->flags & PF_EXITING) continue; if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || (thread->seccomp.mode == SECCOMP_MODE_FILTER && is_ancestor(thread->seccomp.filter, caller->seccomp.filter))) continue; /* Return the first thread that cannot be synchronized. */ failed = task_pid_vnr(thread); /* If the pid cannot be resolved, then return -ESRCH */ if (WARN_ON(failed == 0)) failed = -ESRCH; return failed; } return 0; } static inline void seccomp_filter_free(struct seccomp_filter *filter) { if (filter) { bpf_prog_destroy(filter->prog); kfree(filter); } } static void __seccomp_filter_orphan(struct seccomp_filter *orig) { while (orig && refcount_dec_and_test(&orig->users)) { if (waitqueue_active(&orig->wqh)) wake_up_poll(&orig->wqh, EPOLLHUP); orig = orig->prev; } } static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->refs)) { struct seccomp_filter *freeme = orig; orig = orig->prev; seccomp_filter_free(freeme); } } static void __seccomp_filter_release(struct seccomp_filter *orig) { /* Notify about any unused filters in the task's former filter tree. */ __seccomp_filter_orphan(orig); /* Finally drop all references to the task's former tree. */ __put_seccomp_filter(orig); } /** * seccomp_filter_release - Detach the task from its filter tree, * drop its reference count, and notify * about unused filters * * @tsk: task the filter should be released from. * * This function should only be called when the task is exiting as * it detaches it from its filter tree. PF_EXITING has to be set * for the task. */ void seccomp_filter_release(struct task_struct *tsk) { struct seccomp_filter *orig; if (WARN_ON((tsk->flags & PF_EXITING) == 0)) return; spin_lock_irq(&tsk->sighand->siglock); orig = tsk->seccomp.filter; /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; spin_unlock_irq(&tsk->sighand->siglock); __seccomp_filter_release(orig); } /** * seccomp_sync_threads: sets all threads to use current's filter * * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync. * * Expects sighand and cred_guard_mutex locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks. * */ static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); assert_spin_locked(¤t->sighand->siglock); /* Synchronize all threads. */ caller = current; for_each_thread(caller, thread) { /* Skip current, since it needs no changes. */ if (thread == caller) continue; /* * Skip exited threads. seccomp_filter_release could have * been already called for this task. */ if (thread->flags & PF_EXITING) continue; /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); /* * Drop the task reference to the shared ancestor since * current's path will hold a reference. (This also * allows a put before the assignment.) */ __seccomp_filter_release(thread->seccomp.filter); /* Make our new filter tree visible. */ smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, atomic_read(&caller->seccomp.filter_count)); /* * Don't let an unprivileged task work around * the no_new_privs restriction by creating * a thread that sets it up, enters seccomp, * then dies. */ if (task_no_new_privs(caller)) task_set_no_new_privs(thread); /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER, flags); } } /** * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * * Returns filter on success or an ERR_PTR on failure. */ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; const bool save_orig = #if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE) true; #else false; #endif if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); /* * Installing a seccomp filter requires that the task has * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); if (!sfilter) return ERR_PTR(-ENOMEM); mutex_init(&sfilter->notify_lock); ret = bpf_prog_create_from_user(&sfilter->prog, fprog, seccomp_check_filter, save_orig); if (ret < 0) { kfree(sfilter); return ERR_PTR(ret); } refcount_set(&sfilter->refs, 1); refcount_set(&sfilter->users, 1); init_waitqueue_head(&sfilter->wqh); return sfilter; } /** * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns 0 on success and non-zero otherwise. */ static struct seccomp_filter * seccomp_prepare_user_filter(const char __user *user_filter) { struct sock_fprog fprog; struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; fprog.len = fprog32.len; fprog.filter = compat_ptr(fprog32.filter); } else /* falls through to the if below. */ #endif if (copy_from_user(&fprog, user_filter, sizeof(fprog))) goto out; filter = seccomp_prepare_filter(&fprog); out: return filter; } #ifdef SECCOMP_ARCH_NATIVE /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs * @sd: The seccomp data to check against, only syscall number and arch * number are considered constant. */ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, struct seccomp_data *sd) { unsigned int reg_value = 0; unsigned int pc; bool op_res; if (WARN_ON_ONCE(!fprog)) return false; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; u16 code = insn->code; u32 k = insn->k; switch (code) { case BPF_LD | BPF_W | BPF_ABS: switch (k) { case offsetof(struct seccomp_data, nr): reg_value = sd->nr; break; case offsetof(struct seccomp_data, arch): reg_value = sd->arch; break; default: /* can't optimize (non-constant value load) */ return false; } break; case BPF_RET | BPF_K: /* reached return with constant values only, check allow */ return k == SECCOMP_RET_ALLOW; case BPF_JMP | BPF_JA: pc += insn->k; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: switch (BPF_OP(code)) { case BPF_JEQ: op_res = reg_value == k; break; case BPF_JGE: op_res = reg_value >= k; break; case BPF_JGT: op_res = reg_value > k; break; case BPF_JSET: op_res = !!(reg_value & k); break; default: /* can't optimize (unknown jump) */ return false; } pc += op_res ? insn->jt : insn->jf; break; case BPF_ALU | BPF_AND | BPF_K: reg_value &= k; break; default: /* can't optimize (unknown insn) */ return false; } } /* ran off the end of the filter?! */ WARN_ON(1); return false; } static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, void *bitmap, const void *bitmap_prev, size_t bitmap_size, int arch) { struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; struct seccomp_data sd; int nr; if (bitmap_prev) { /* The new filter must be as restrictive as the last. */ bitmap_copy(bitmap, bitmap_prev, bitmap_size); } else { /* Before any filters, all syscalls are always allowed. */ bitmap_fill(bitmap, bitmap_size); } for (nr = 0; nr < bitmap_size; nr++) { /* No bitmap change: not a cacheable action. */ if (!test_bit(nr, bitmap)) continue; sd.nr = nr; sd.arch = arch; /* No bitmap change: continue to always allow. */ if (seccomp_is_const_allow(fprog, &sd)) continue; /* * Not a cacheable action: always run filters. * atomic clear_bit() not needed, filter not visible yet. */ __clear_bit(nr, bitmap); } } /** * seccomp_cache_prepare - emulate the filter to find cacheable syscalls * @sfilter: The seccomp filter * * Returns 0 if successful or -errno if error occurred. */ static void seccomp_cache_prepare(struct seccomp_filter *sfilter) { struct action_cache *cache = &sfilter->cache; const struct action_cache *cache_prev = sfilter->prev ? &sfilter->prev->cache : NULL; seccomp_cache_prepare_bitmap(sfilter, cache->allow_native, cache_prev ? cache_prev->allow_native : NULL, SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE); #ifdef SECCOMP_ARCH_COMPAT seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat, cache_prev ? cache_prev->allow_compat : NULL, SECCOMP_ARCH_COMPAT_NR, SECCOMP_ARCH_COMPAT); #endif /* SECCOMP_ARCH_COMPAT */ } #endif /* SECCOMP_ARCH_NATIVE */ /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior * @filter: seccomp filter to add to the current process * * Caller must be holding current->sighand->siglock lock. * * Returns 0 on success, -ve on error, or * - in TSYNC mode: the pid of a thread which was either not in the correct * seccomp mode or did not have an ancestral seccomp filter * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, struct seccomp_filter *filter) { unsigned long total_insns; struct seccomp_filter *walker; assert_spin_locked(¤t->sighand->siglock); /* Validate resulting filter length. */ total_insns = filter->prog->len; for (walker = current->seccomp.filter; walker; walker = walker->prev) total_insns += walker->prog->len + 4; /* 4 instr penalty */ if (total_insns > MAX_INSNS_PER_PATH) return -ENOMEM; /* If thread sync has been requested, check that it is possible. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) { int ret; ret = seccomp_can_sync_threads(); if (ret) { if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) return -ESRCH; else return ret; } } /* Set log flag, if present. */ if (flags & SECCOMP_FILTER_FLAG_LOG) filter->log = true; /* Set wait killable flag, if present. */ if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) filter->wait_killable_recv = true; /* * If there is an existing filter, make it the prev and don't drop its * task reference. */ filter->prev = current->seccomp.filter; seccomp_cache_prepare(filter); current->seccomp.filter = filter; atomic_inc(¤t->seccomp.filter_count); /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) seccomp_sync_threads(flags); return 0; } static void __get_seccomp_filter(struct seccomp_filter *filter) { refcount_inc(&filter->refs); } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; __get_seccomp_filter(orig); refcount_inc(&orig->users); } #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ #define SECCOMP_LOG_KILL_PROCESS (1 << 0) #define SECCOMP_LOG_KILL_THREAD (1 << 1) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) #define SECCOMP_LOG_USER_NOTIF (1 << 7) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_USER_NOTIF | SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, bool requested) { bool log = false; switch (action) { case SECCOMP_RET_ALLOW: break; case SECCOMP_RET_TRAP: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; break; case SECCOMP_RET_ERRNO: log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; break; case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; case SECCOMP_RET_USER_NOTIF: log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF; break; case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; case SECCOMP_RET_KILL_THREAD: log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; break; case SECCOMP_RET_KILL_PROCESS: default: log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; } /* * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the * FILTER_FLAG_LOG bit was set. The admin has the ability to silence * any action from being logged by removing the action name from the * seccomp_actions_logged sysctl. */ if (!log) return; audit_seccomp(syscall, signr, action); } /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit * to limit the stack allocations too. */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, -1, /* negative terminated */ }; static void __secure_computing_strict(int this_syscall) { const int *allowed_syscalls = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) allowed_syscalls = get_compat_mode1_syscalls(); #endif do { if (*allowed_syscalls == this_syscall) return; } while (*++allowed_syscalls != -1); #ifdef SECCOMP_DEBUG dump_stack(); #endif current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } #ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); else BUG(); } #else #ifdef CONFIG_SECCOMP_FILTER static u64 seccomp_next_notify_id(struct seccomp_filter *filter) { /* * Note: overflow is ok here, the id just needs to be unique per * filter. */ lockdep_assert_held(&filter->notify_lock); return filter->notif->next_id++; } static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n) { int fd; /* * Remove the notification, and reset the list pointers, indicating * that it has been handled. */ list_del_init(&addfd->list); if (!addfd->setfd) fd = receive_fd(addfd->file, NULL, addfd->flags); else fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); addfd->ret = fd; if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) { /* If we fail reset and return an error to the notifier */ if (fd < 0) { n->state = SECCOMP_NOTIFY_SENT; } else { /* Return the FD we just added */ n->flags = 0; n->error = 0; n->val = fd; } } /* * Mark the notification as completed. From this point, addfd mem * might be invalidated and we can't safely read it anymore. */ complete(&addfd->completion); } static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; } static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) { int err; u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; if (!match->notif) goto out; n.task = current; n.state = SECCOMP_NOTIFY_INIT; n.data = sd; n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); atomic_inc(&match->notif->requests); if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM); else wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); /* * This is where we wait for a reply from userspace. */ do { bool wait_killable = should_sleep_killable(match, &n); mutex_unlock(&match->notify_lock); if (wait_killable) err = wait_for_completion_killable(&n.ready); else err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err != 0) { /* * Check to see if the notifcation got picked up and * whether we should switch to wait killable. */ if (!wait_killable && should_sleep_killable(match, &n)) continue; goto interrupted; } addfd = list_first_entry_or_null(&n.addfd, struct seccomp_kaddfd, list); /* Check if we were woken up by a addfd message */ if (addfd) seccomp_handle_addfd(addfd, &n); } while (n.state != SECCOMP_NOTIFY_REPLIED); ret = n.val; err = n.error; flags = n.flags; interrupted: /* If there were any pending addfd calls, clear them out */ list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { /* The process went away before we got a chance to handle it */ addfd->ret = -ESRCH; list_del_init(&addfd->list); complete(&addfd->completion); } /* * Note that it's possible the listener died in between the time when * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * * Also note that this test is only valid because there's no way to * *reattach* to a notifier right now. If one is added, we'll need to * keep track of the notif itself and make sure they match here. */ if (match->notif) list_del(&n.list); out: mutex_unlock(&match->notify_lock); /* Userspace requests to continue the syscall. */ if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) return 0; syscall_set_return_value(current, current_pt_regs(), err, ret); return -1; } static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { u32 filter_ret, action; struct seccomp_filter *match = NULL; int data; struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have * been seen after SYSCALL_WORK_SECCOMP was seen. */ smp_rmb(); if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; } filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; switch (action) { case SECCOMP_RET_ERRNO: /* Set low-order bits as an errno, capped at MAX_ERRNO. */ if (data > MAX_ERRNO) data = MAX_ERRNO; syscall_set_return_value(current, current_pt_regs(), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ force_sig_seccomp(this_syscall, data, false); goto skip; case SECCOMP_RET_TRACE: /* We've been put in this state by the ptracer already. */ if (recheck_after_trace) return 0; /* ENOSYS these calls if there is no tracer attached. */ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { syscall_set_return_value(current, current_pt_regs(), -ENOSYS, 0); goto skip; } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* * The delivery of a fatal signal during event * notification may silently skip tracer notification, * which could leave us with a potentially unmodified * syscall that the tracer would have liked to have * changed. Since the process is about to die, we just * force the syscall to be skipped and let the signal * kill the process and correctly handle any tracer exit * notifications. */ if (fatal_signal_pending(current)) goto skip; /* Check if the tracer forced the syscall to be skipped. */ this_syscall = syscall_get_nr(current, current_pt_regs()); if (this_syscall < 0) goto skip; /* * Recheck the syscall, since it may have changed. This * intentionally uses a NULL struct seccomp_data to force * a reload of all registers. This does not goto skip since * a skip would have already been reported. */ if (__seccomp_filter(this_syscall, NULL, true)) return -1; return 0; case SECCOMP_RET_USER_NOTIF: if (seccomp_do_user_notification(this_syscall, match, sd)) goto skip; return 0; case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); return 0; case SECCOMP_RET_ALLOW: /* * Note that the "match" filter will always be NULL for * this action since SECCOMP_RET_ALLOW is the starting * state in seccomp_run_filters(). */ return 0; case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_KILL_PROCESS: default: current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || (atomic_read(¤t->signal->live) == 1)) { /* Show the original registers in the dump. */ syscall_rollback(current, current_pt_regs()); /* Trigger a coredump with SIGSYS */ force_sig_seccomp(this_syscall, data, true); } else { do_exit(SIGSYS); } return -1; /* skip the syscall go directly to signal handling */ } unreachable(); skip: seccomp_log(this_syscall, 0, action, match ? match->log : false); return -1; } #else static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { BUG(); return -1; } #endif int __secure_computing(const struct seccomp_data *sd) { int mode = current->seccomp.mode; int this_syscall; if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; this_syscall = sd ? sd->nr : syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: return __seccomp_filter(this_syscall, sd, false); /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ case SECCOMP_MODE_DEAD: WARN_ON_ONCE(1); do_exit(SIGKILL); return -1; default: BUG(); } } #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) { return current->seccomp.mode; } /** * seccomp_set_mode_strict: internal function for setting strict seccomp * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. */ static long seccomp_set_mode_strict(void) { const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; long ret = -EINVAL; spin_lock_irq(¤t->sighand->siglock); if (!seccomp_may_assign_mode(seccomp_mode)) goto out; #ifdef TIF_NOTSC disable_TSC(); #endif seccomp_assign_mode(current, seccomp_mode, 0); ret = 0; out: spin_unlock_irq(¤t->sighand->siglock); return ret; } #ifdef CONFIG_SECCOMP_FILTER static void seccomp_notify_free(struct seccomp_filter *filter) { kfree(filter->notif); filter->notif = NULL; } static void seccomp_notify_detach(struct seccomp_filter *filter) { struct seccomp_knotif *knotif; if (!filter) return; mutex_lock(&filter->notify_lock); /* * If this file is being closed because e.g. the task who owned it * died, let's wake everyone up who was waiting on us. */ list_for_each_entry(knotif, &filter->notif->notifications, list) { if (knotif->state == SECCOMP_NOTIFY_REPLIED) continue; knotif->state = SECCOMP_NOTIFY_REPLIED; knotif->error = -ENOSYS; knotif->val = 0; /* * We do not need to wake up any pending addfd messages, as * the notifier will do that for us, as this just looks * like a standard reply. */ complete(&knotif->ready); } seccomp_notify_free(filter); mutex_unlock(&filter->notify_lock); } static int seccomp_notify_release(struct inode *inode, struct file *file) { struct seccomp_filter *filter = file->private_data; seccomp_notify_detach(filter); __put_seccomp_filter(filter); return 0; } /* must be called with notif_lock held */ static inline struct seccomp_knotif * find_notification(struct seccomp_filter *filter, u64 id) { struct seccomp_knotif *cur; lockdep_assert_held(&filter->notify_lock); list_for_each_entry(cur, &filter->notif->notifications, list) { if (cur->id == id) return cur; } return NULL; } static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { /* Avoid a wakeup if event not interesting for us. */ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR | EPOLLHUP))) return 0; return autoremove_wake_function(wait, mode, sync, key); } static int recv_wait_event(struct seccomp_filter *filter) { DEFINE_WAIT_FUNC(wait, recv_wake_function); int ret; if (refcount_read(&filter->users) == 0) return 0; if (atomic_dec_if_positive(&filter->notif->requests) >= 0) return 0; for (;;) { ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE); if (atomic_dec_if_positive(&filter->notif->requests) >= 0) break; if (refcount_read(&filter->users) == 0) break; if (ret) return ret; schedule(); } finish_wait(&filter->wqh, &wait); return 0; } static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) { struct seccomp_knotif *knotif = NULL, *cur; struct seccomp_notif unotif; ssize_t ret; /* Verify that we're not given garbage to keep struct extensible. */ ret = check_zeroed_user(buf, sizeof(unotif)); if (ret < 0) return ret; if (!ret) return -EINVAL; memset(&unotif, 0, sizeof(unotif)); ret = recv_wait_event(filter); if (ret < 0) return ret; mutex_lock(&filter->notify_lock); list_for_each_entry(cur, &filter->notif->notifications, list) { if (cur->state == SECCOMP_NOTIFY_INIT) { knotif = cur; break; } } /* * If we didn't find a notification, it could be that the task was * interrupted by a fatal signal between the time we were woken and * when we were able to acquire the rw lock. */ if (!knotif) { ret = -ENOENT; goto out; } unotif.id = knotif->id; unotif.pid = task_pid_vnr(knotif->task); unotif.data = *(knotif->data); knotif->state = SECCOMP_NOTIFY_SENT; wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM); ret = 0; out: mutex_unlock(&filter->notify_lock); if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) { ret = -EFAULT; /* * Userspace screwed up. To make sure that we keep this * notification alive, let's reset it back to INIT. It * may have died when we released the lock, so we need to make * sure it's still around. */ mutex_lock(&filter->notify_lock); knotif = find_notification(filter, unotif.id); if (knotif) { /* Reset the process to make sure it's not stuck */ if (should_sleep_killable(filter, knotif)) complete(&knotif->ready); knotif->state = SECCOMP_NOTIFY_INIT; atomic_inc(&filter->notif->requests); wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM); } mutex_unlock(&filter->notify_lock); } return ret; } static long seccomp_notify_send(struct seccomp_filter *filter, void __user *buf) { struct seccomp_notif_resp resp = {}; struct seccomp_knotif *knotif; long ret; if (copy_from_user(&resp, buf, sizeof(resp))) return -EFAULT; if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE) return -EINVAL; if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) && (resp.error || resp.val)) return -EINVAL; ret = mutex_lock_interruptible(&filter->notify_lock); if (ret < 0) return ret; knotif = find_notification(filter, resp.id); if (!knotif) { ret = -ENOENT; goto out; } /* Allow exactly one reply. */ if (knotif->state != SECCOMP_NOTIFY_SENT) { ret = -EINPROGRESS; goto out; } ret = 0; knotif->state = SECCOMP_NOTIFY_REPLIED; knotif->error = resp.error; knotif->val = resp.val; knotif->flags = resp.flags; if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) complete_on_current_cpu(&knotif->ready); else complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); return ret; } static long seccomp_notify_id_valid(struct seccomp_filter *filter, void __user *buf) { struct seccomp_knotif *knotif; u64 id; long ret; if (copy_from_user(&id, buf, sizeof(id))) return -EFAULT; ret = mutex_lock_interruptible(&filter->notify_lock); if (ret < 0) return ret; knotif = find_notification(filter, id); if (knotif && knotif->state == SECCOMP_NOTIFY_SENT) ret = 0; else ret = -ENOENT; mutex_unlock(&filter->notify_lock); return ret; } static long seccomp_notify_set_flags(struct seccomp_filter *filter, unsigned long flags) { long ret; if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) return -EINVAL; ret = mutex_lock_interruptible(&filter->notify_lock); if (ret < 0) return ret; filter->notif->flags = flags; mutex_unlock(&filter->notify_lock); return 0; } static long seccomp_notify_addfd(struct seccomp_filter *filter, struct seccomp_notif_addfd __user *uaddfd, unsigned int size) { struct seccomp_notif_addfd addfd; struct seccomp_knotif *knotif; struct seccomp_kaddfd kaddfd; int ret; BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) return -EINVAL; ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); if (ret) return ret; if (addfd.newfd_flags & ~O_CLOEXEC) return -EINVAL; if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND)) return -EINVAL; if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) return -EINVAL; kaddfd.file = fget(addfd.srcfd); if (!kaddfd.file) return -EBADF; kaddfd.ioctl_flags = addfd.flags; kaddfd.flags = addfd.newfd_flags; kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD; kaddfd.fd = addfd.newfd; init_completion(&kaddfd.completion); ret = mutex_lock_interruptible(&filter->notify_lock); if (ret < 0) goto out; knotif = find_notification(filter, addfd.id); if (!knotif) { ret = -ENOENT; goto out_unlock; } /* * We do not want to allow for FD injection to occur before the * notification has been picked up by a userspace handler, or after * the notification has been replied to. */ if (knotif->state != SECCOMP_NOTIFY_SENT) { ret = -EINPROGRESS; goto out_unlock; } if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) { /* * Disallow queuing an atomic addfd + send reply while there are * some addfd requests still to process. * * There is no clear reason to support it and allows us to keep * the loop on the other side straight-forward. */ if (!list_empty(&knotif->addfd)) { ret = -EBUSY; goto out_unlock; } /* Allow exactly only one reply */ knotif->state = SECCOMP_NOTIFY_REPLIED; } list_add(&kaddfd.list, &knotif->addfd); complete(&knotif->ready); mutex_unlock(&filter->notify_lock); /* Now we wait for it to be processed or be interrupted */ ret = wait_for_completion_interruptible(&kaddfd.completion); if (ret == 0) { /* * We had a successful completion. The other side has already * removed us from the addfd queue, and * wait_for_completion_interruptible has a memory barrier upon * success that lets us read this value directly without * locking. */ ret = kaddfd.ret; goto out; } mutex_lock(&filter->notify_lock); /* * Even though we were woken up by a signal and not a successful * completion, a completion may have happened in the mean time. * * We need to check again if the addfd request has been handled, * and if not, we will remove it from the queue. */ if (list_empty(&kaddfd.list)) ret = kaddfd.ret; else list_del(&kaddfd.list); out_unlock: mutex_unlock(&filter->notify_lock); out: fput(kaddfd.file); return ret; } static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); case SECCOMP_IOCTL_NOTIF_SET_FLAGS: return seccomp_notify_set_flags(filter, arg); } /* Extensible Argument ioctls */ #define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) switch (EA_IOCTL(cmd)) { case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } } static __poll_t seccomp_notify_poll(struct file *file, struct poll_table_struct *poll_tab) { struct seccomp_filter *filter = file->private_data; __poll_t ret = 0; struct seccomp_knotif *cur; poll_wait(file, &filter->wqh, poll_tab); if (mutex_lock_interruptible(&filter->notify_lock) < 0) return EPOLLERR; list_for_each_entry(cur, &filter->notif->notifications, list) { if (cur->state == SECCOMP_NOTIFY_INIT) ret |= EPOLLIN | EPOLLRDNORM; if (cur->state == SECCOMP_NOTIFY_SENT) ret |= EPOLLOUT | EPOLLWRNORM; if ((ret & EPOLLIN) && (ret & EPOLLOUT)) break; } mutex_unlock(&filter->notify_lock); if (refcount_read(&filter->users) == 0) ret |= EPOLLHUP; return ret; } static const struct file_operations seccomp_notify_ops = { .poll = seccomp_notify_poll, .release = seccomp_notify_release, .unlocked_ioctl = seccomp_notify_ioctl, .compat_ioctl = seccomp_notify_ioctl, }; static struct file *init_listener(struct seccomp_filter *filter) { struct file *ret; ret = ERR_PTR(-ENOMEM); filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL); if (!filter->notif) goto out; filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, filter, O_RDWR); if (IS_ERR(ret)) goto out_notif; /* The file has a reference to it now */ __get_seccomp_filter(filter); out_notif: if (IS_ERR(ret)) seccomp_notify_free(filter); out: return ret; } /* * Does @new_child have a listener while an ancestor also has a listener? * If so, we'll want to reject this filter. * This only has to be tested for the current process, even in the TSYNC case, * because TSYNC installs @child with the same parent on all threads. * Note that @new_child is not hooked up to its parent at this point yet, so * we use current->seccomp.filter. */ static bool has_duplicate_listener(struct seccomp_filter *new_child) { struct seccomp_filter *cur; /* must be protected against concurrent TSYNC */ lockdep_assert_held(¤t->sighand->siglock); if (!new_child->notif) return false; for (cur = current->seccomp.filter; cur; cur = cur->prev) { if (cur->notif) return true; } return false; } /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior * @filter: struct sock_fprog containing filter * * This function may be called repeatedly to install additional filters. * Every filter successfully installed will be evaluated (in reverse order) * for each system call the task makes. * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. */ static long seccomp_set_mode_filter(unsigned int flags, const char __user *filter) { const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; struct seccomp_filter *prepared = NULL; long ret = -EINVAL; int listener = -1; struct file *listener_f = NULL; /* Validate flags. */ if (flags & ~SECCOMP_FILTER_FLAG_MASK) return -EINVAL; /* * In the successful case, NEW_LISTENER returns the new listener fd. * But in the failure case, TSYNC returns the thread that died. If you * combine these two flags, there's no way to tell whether something * succeeded or failed. So, let's disallow this combination if the user * has not explicitly requested no errors from TSYNC. */ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) && ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; /* * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag. */ if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) && ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0)) return -EINVAL; /* Prepare the new filter before holding any locks. */ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) return PTR_ERR(prepared); if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { listener = get_unused_fd_flags(O_CLOEXEC); if (listener < 0) { ret = listener; goto out_free; } listener_f = init_listener(prepared); if (IS_ERR(listener_f)) { put_unused_fd(listener); ret = PTR_ERR(listener_f); goto out_free; } } /* * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC && mutex_lock_killable(¤t->signal->cred_guard_mutex)) goto out_put_fd; spin_lock_irq(¤t->sighand->siglock); if (!seccomp_may_assign_mode(seccomp_mode)) goto out; if (has_duplicate_listener(prepared)) { ret = -EBUSY; goto out; } ret = seccomp_attach_filter(flags, prepared); if (ret) goto out; /* Do not free the successfully attached filter. */ prepared = NULL; seccomp_assign_mode(current, seccomp_mode, flags); out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) mutex_unlock(¤t->signal->cred_guard_mutex); out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { if (ret) { listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); seccomp_notify_detach(prepared); } else { fd_install(listener, listener_f); ret = listener; } } out_free: seccomp_filter_free(prepared); return ret; } #else static inline long seccomp_set_mode_filter(unsigned int flags, const char __user *filter) { return -EINVAL; } #endif static long seccomp_get_action_avail(const char __user *uaction) { u32 action; if (copy_from_user(&action, uaction, sizeof(action))) return -EFAULT; switch (action) { case SECCOMP_RET_KILL_PROCESS: case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_USER_NOTIF: case SECCOMP_RET_TRACE: case SECCOMP_RET_LOG: case SECCOMP_RET_ALLOW: break; default: return -EOPNOTSUPP; } return 0; } static long seccomp_get_notif_sizes(void __user *usizes) { struct seccomp_notif_sizes sizes = { .seccomp_notif = sizeof(struct seccomp_notif), .seccomp_notif_resp = sizeof(struct seccomp_notif_resp), .seccomp_data = sizeof(struct seccomp_data), }; if (copy_to_user(usizes, &sizes, sizeof(sizes))) return -EFAULT; return 0; } /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, void __user *uargs) { switch (op) { case SECCOMP_SET_MODE_STRICT: if (flags != 0 || uargs != NULL) return -EINVAL; return seccomp_set_mode_strict(); case SECCOMP_SET_MODE_FILTER: return seccomp_set_mode_filter(flags, uargs); case SECCOMP_GET_ACTION_AVAIL: if (flags != 0) return -EINVAL; return seccomp_get_action_avail(uargs); case SECCOMP_GET_NOTIF_SIZES: if (flags != 0) return -EINVAL; return seccomp_get_notif_sizes(uargs); default: return -EINVAL; } } SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, void __user *, uargs) { return do_seccomp(op, flags, uargs); } /** * prctl_set_seccomp: configures current->seccomp.mode * @seccomp_mode: requested mode to use * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER * * Returns 0 on success or -EINVAL on failure. */ long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter) { unsigned int op; void __user *uargs; switch (seccomp_mode) { case SECCOMP_MODE_STRICT: op = SECCOMP_SET_MODE_STRICT; /* * Setting strict mode through prctl always ignored filter, * so make sure it is always NULL here to pass the internal * check in do_seccomp(). */ uargs = NULL; break; case SECCOMP_MODE_FILTER: op = SECCOMP_SET_MODE_FILTER; uargs = filter; break; default: return -EINVAL; } /* prctl interface doesn't have flags, so they are always zero. */ return do_seccomp(op, 0, uargs); } #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) static struct seccomp_filter *get_nth_filter(struct task_struct *task, unsigned long filter_off) { struct seccomp_filter *orig, *filter; unsigned long count; /* * Note: this is only correct because the caller should be the (ptrace) * tracer of the task, otherwise lock_task_sighand is needed. */ spin_lock_irq(&task->sighand->siglock); if (task->seccomp.mode != SECCOMP_MODE_FILTER) { spin_unlock_irq(&task->sighand->siglock); return ERR_PTR(-EINVAL); } orig = task->seccomp.filter; __get_seccomp_filter(orig); spin_unlock_irq(&task->sighand->siglock); count = 0; for (filter = orig; filter; filter = filter->prev) count++; if (filter_off >= count) { filter = ERR_PTR(-ENOENT); goto out; } count -= filter_off; for (filter = orig; filter && count > 1; filter = filter->prev) count--; if (WARN_ON(count != 1 || !filter)) { filter = ERR_PTR(-ENOENT); goto out; } __get_seccomp_filter(filter); out: __put_seccomp_filter(orig); return filter; } long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, void __user *data) { struct seccomp_filter *filter; struct sock_fprog_kern *fprog; long ret; if (!capable(CAP_SYS_ADMIN) || current->seccomp.mode != SECCOMP_MODE_DISABLED) { return -EACCES; } filter = get_nth_filter(task, filter_off); if (IS_ERR(filter)) return PTR_ERR(filter); fprog = filter->prog->orig_prog; if (!fprog) { /* This must be a new non-cBPF filter, since we save * every cBPF filter's orig_prog above when * CONFIG_CHECKPOINT_RESTORE is enabled. */ ret = -EMEDIUMTYPE; goto out; } ret = fprog->len; if (!data) goto out; if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; out: __put_seccomp_filter(filter); return ret; } long seccomp_get_metadata(struct task_struct *task, unsigned long size, void __user *data) { long ret; struct seccomp_filter *filter; struct seccomp_metadata kmd = {}; if (!capable(CAP_SYS_ADMIN) || current->seccomp.mode != SECCOMP_MODE_DISABLED) { return -EACCES; } size = min_t(unsigned long, size, sizeof(kmd)); if (size < sizeof(kmd.filter_off)) return -EINVAL; if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off))) return -EFAULT; filter = get_nth_filter(task, kmd.filter_off); if (IS_ERR(filter)) return PTR_ERR(filter); if (filter->log) kmd.flags |= SECCOMP_FILTER_FLAG_LOG; ret = size; if (copy_to_user(data, &kmd, size)) ret = -EFAULT; __put_seccomp_filter(filter); return ret; } #endif #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ #define SECCOMP_RET_KILL_PROCESS_NAME "kill_process" #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_USER_NOTIF_NAME "user_notif" #define SECCOMP_RET_TRACE_NAME "trace" #define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_PROCESS_NAME " " SECCOMP_RET_KILL_THREAD_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " SECCOMP_RET_USER_NOTIF_NAME " " SECCOMP_RET_TRACE_NAME " " SECCOMP_RET_LOG_NAME " " SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { u32 log; const char *name; }; static const struct seccomp_log_name seccomp_log_names[] = { { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, { } }; static bool seccomp_names_from_actions_logged(char *names, size_t size, u32 actions_logged, const char *sep) { const struct seccomp_log_name *cur; bool append_sep = false; for (cur = seccomp_log_names; cur->name && size; cur++) { ssize_t ret; if (!(actions_logged & cur->log)) continue; if (append_sep) { ret = strscpy(names, sep, size); if (ret < 0) return false; names += ret; size -= ret; } else append_sep = true; ret = strscpy(names, cur->name, size); if (ret < 0) return false; names += ret; size -= ret; } return true; } static bool seccomp_action_logged_from_name(u32 *action_logged, const char *name) { const struct seccomp_log_name *cur; for (cur = seccomp_log_names; cur->name; cur++) { if (!strcmp(cur->name, name)) { *action_logged = cur->log; return true; } } return false; } static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) { char *name; *actions_logged = 0; while ((name = strsep(&names, " ")) && *name) { u32 action_logged = 0; if (!seccomp_action_logged_from_name(&action_logged, name)) return false; *actions_logged |= action_logged; } return true; } static int read_actions_logged(const struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos) { char names[sizeof(seccomp_actions_avail)]; struct ctl_table table; memset(names, 0, sizeof(names)); if (!seccomp_names_from_actions_logged(names, sizeof(names), seccomp_actions_logged, " ")) return -EINVAL; table = *ro_table; table.data = names; table.maxlen = sizeof(names); return proc_dostring(&table, 0, buffer, lenp, ppos); } static int write_actions_logged(const struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos, u32 *actions_logged) { char names[sizeof(seccomp_actions_avail)]; struct ctl_table table; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(names, 0, sizeof(names)); table = *ro_table; table.data = names; table.maxlen = sizeof(names); ret = proc_dostring(&table, 1, buffer, lenp, ppos); if (ret) return ret; if (!seccomp_actions_logged_from_names(actions_logged, table.data)) return -EINVAL; if (*actions_logged & SECCOMP_LOG_ALLOW) return -EINVAL; seccomp_actions_logged = *actions_logged; return 0; } static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, int ret) { char names[sizeof(seccomp_actions_avail)]; char old_names[sizeof(seccomp_actions_avail)]; const char *new = names; const char *old = old_names; if (!audit_enabled) return; memset(names, 0, sizeof(names)); memset(old_names, 0, sizeof(old_names)); if (ret) new = "?"; else if (!actions_logged) new = "(none)"; else if (!seccomp_names_from_actions_logged(names, sizeof(names), actions_logged, ",")) new = "?"; if (!old_actions_logged) old = "(none)"; else if (!seccomp_names_from_actions_logged(old_names, sizeof(old_names), old_actions_logged, ",")) old = "?"; return audit_seccomp_actions_logged(new, old, !ret); } static int seccomp_actions_logged_handler(const struct ctl_table *ro_table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; if (write) { u32 actions_logged = 0; u32 old_actions_logged = seccomp_actions_logged; ret = write_actions_logged(ro_table, buffer, lenp, ppos, &actions_logged); audit_actions_logged(actions_logged, old_actions_logged, ret); } else ret = read_actions_logged(ro_table, buffer, lenp, ppos); return ret; } static struct ctl_table seccomp_sysctl_table[] = { { .procname = "actions_avail", .data = (void *) &seccomp_actions_avail, .maxlen = sizeof(seccomp_actions_avail), .mode = 0444, .proc_handler = proc_dostring, }, { .procname = "actions_logged", .mode = 0644, .proc_handler = seccomp_actions_logged_handler, }, }; static int __init seccomp_sysctl_init(void) { register_sysctl_init("kernel/seccomp", seccomp_sysctl_table); return 0; } device_initcall(seccomp_sysctl_init) #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_SECCOMP_CACHE_DEBUG /* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */ static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name, const void *bitmap, size_t bitmap_size) { int nr; for (nr = 0; nr < bitmap_size; nr++) { bool cached = test_bit(nr, bitmap); char *status = cached ? "ALLOW" : "FILTER"; seq_printf(m, "%s %d %s\n", name, nr, status); } } int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct seccomp_filter *f; unsigned long flags; /* * We don't want some sandboxed process to know what their seccomp * filters consist of. */ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES; if (!lock_task_sighand(task, &flags)) return -ESRCH; f = READ_ONCE(task->seccomp.filter); if (!f) { unlock_task_sighand(task, &flags); return 0; } /* prevent filter from being freed while we are printing it */ __get_seccomp_filter(f); unlock_task_sighand(task, &flags); proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME, f->cache.allow_native, SECCOMP_ARCH_NATIVE_NR); #ifdef SECCOMP_ARCH_COMPAT proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME, f->cache.allow_compat, SECCOMP_ARCH_COMPAT_NR); #endif /* SECCOMP_ARCH_COMPAT */ __put_seccomp_filter(f); return 0; } #endif /* CONFIG_SECCOMP_CACHE_DEBUG */ |
| 11 4 7 11 11 40 3 3 3 3 3 29 3 6 1 2 1 27 1 3 15 41 41 11 1 3 3 3 41 41 41 43 40 23 6 19 4 26 18 1 24 18 18 18 18 2 2 7 11 18 18 15 16 25 15 7 9 23 25 7 20 25 25 24 25 28 11 28 26 2 25 1 25 25 25 28 28 24 20 9 25 24 20 9 2 1 17 3 6 6 12 1 1 17 2 32 23 26 31 4 23 25 25 18 4 30 11 1 15 2 16 1 1 1 1 43 27 16 31 20 40 7 23 21 21 12 28 32 30 27 1 11 1 28 28 28 1 28 28 25 24 24 25 3 3 3 25 43 43 43 43 12 12 26 11 15 24 26 15 15 15 31 31 31 31 16 2 2 15 16 15 1 1 1 16 16 5 15 2 6 2 1 1 4 4 1 3 2 3 4 2 2 1 1 4 17 1 2 3 13 2 13 1 12 11 11 12 12 9 6 9 6 12 11 3 11 11 11 11 11 12 9 6 11 7 4 17 5 1 4 8 4 1 6 4 4 4 6 2 2 7 7 1 6 17 15 8 7 7 7 8 1 7 5 5 5 5 5 5 4 5 5 13 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 9 9 9 9 9 5 5 5 5 5 5 1 1 1 1 1 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. */ #include <linux/mm.h> #include <linux/rbtree.h> #include <trace/events/btrfs.h> #include "ctree.h" #include "disk-io.h" #include "backref.h" #include "ulist.h" #include "transaction.h" #include "delayed-ref.h" #include "locking.h" #include "misc.h" #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "relocation.h" #include "tree-checker.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 #define BACKREF_FOUND_NOT_SHARED 7 struct extent_inode_elem { u64 inum; u64 offset; u64 num_bytes; struct extent_inode_elem *next; }; static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, struct extent_inode_elem **eie) { const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); u64 offset = key->offset; struct extent_inode_elem *e; const u64 *root_ids; int root_count; bool cached; if (!ctx->ignore_extent_item_pos && !btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; data_offset = btrfs_file_extent_offset(eb, fi); if (ctx->extent_item_pos < data_offset || ctx->extent_item_pos >= data_offset + data_len) return 1; offset += ctx->extent_item_pos - data_offset; } if (!ctx->indirect_ref_iterator || !ctx->cache_lookup) goto add_inode_elem; cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids, &root_count); if (!cached) goto add_inode_elem; for (int i = 0; i < root_count; i++) { int ret; ret = ctx->indirect_ref_iterator(key->objectid, offset, data_len, root_ids[i], ctx->user_ctx); if (ret) return ret; } add_inode_elem: e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) return -ENOMEM; e->next = *eie; e->inum = key->objectid; e->offset = offset; e->num_bytes = data_len; *eie = e; return 0; } static void free_inode_elem_list(struct extent_inode_elem *eie) { struct extent_inode_elem *eie_next; for (; eie; eie = eie_next) { eie_next = eie->next; kfree(eie); } } static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, const struct extent_buffer *eb, struct extent_inode_elem **eie) { u64 disk_byte; struct btrfs_key key; struct btrfs_file_extent_item *fi; int slot; int nritems; int extent_type; int ret; /* * from the shared data ref, we only have the leaf but we need * the key. thus, we must look into all items and see that we * find one (some) with a reference to our extent item. */ nritems = btrfs_header_nritems(eb); for (slot = 0; slot < nritems; ++slot) { btrfs_item_key_to_cpu(eb, &key, slot); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(eb, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) continue; /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); if (disk_byte != ctx->bytenr) continue; ret = check_extent_in_eb(ctx, &key, eb, fi, eie); if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) return ret; } return 0; } struct preftree { struct rb_root_cached root; unsigned int count; }; #define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 } struct preftrees { struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */ struct preftree indirect; /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */ struct preftree indirect_missing_keys; }; /* * Checks for a shared extent during backref search. * * The share_count tracks prelim_refs (direct and indirect) having a * ref->count >0: * - incremented when a ref->count transitions to >0 * - decremented when a ref->count transitions to <1 */ struct share_check { struct btrfs_backref_share_check_ctx *ctx; struct btrfs_root *root; u64 inum; u64 data_bytenr; u64 data_extent_gen; /* * Counts number of inodes that refer to an extent (different inodes in * the same root or different roots) that we could find. The sharedness * check typically stops once this counter gets greater than 1, so it * may not reflect the total number of inodes. */ int share_count; /* * The number of times we found our inode refers to the data extent we * are determining the sharedness. In other words, how many file extent * items we could find for our inode that point to our target data * extent. The value we get here after finishing the extent sharedness * check may be smaller than reality, but if it ends up being greater * than 1, then we know for sure the inode has multiple file extent * items that point to our inode, and we can safely assume it's useful * to cache the sharedness check result. */ int self_ref_count; bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) { return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0; } static struct kmem_cache *btrfs_prelim_ref_cache; int __init btrfs_prelim_ref_init(void) { btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", sizeof(struct prelim_ref), 0, 0, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; return 0; } void __cold btrfs_prelim_ref_exit(void) { kmem_cache_destroy(btrfs_prelim_ref_cache); } static void free_pref(struct prelim_ref *ref) { kmem_cache_free(btrfs_prelim_ref_cache, ref); } /* * Return 0 when both refs are for the same block (and can be merged). * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 * indicates a 'higher' block. */ static int prelim_ref_compare(const struct prelim_ref *ref1, const struct prelim_ref *ref2) { if (ref1->level < ref2->level) return -1; if (ref1->level > ref2->level) return 1; if (ref1->root_id < ref2->root_id) return -1; if (ref1->root_id > ref2->root_id) return 1; if (ref1->key_for_search.type < ref2->key_for_search.type) return -1; if (ref1->key_for_search.type > ref2->key_for_search.type) return 1; if (ref1->key_for_search.objectid < ref2->key_for_search.objectid) return -1; if (ref1->key_for_search.objectid > ref2->key_for_search.objectid) return 1; if (ref1->key_for_search.offset < ref2->key_for_search.offset) return -1; if (ref1->key_for_search.offset > ref2->key_for_search.offset) return 1; if (ref1->parent < ref2->parent) return -1; if (ref1->parent > ref2->parent) return 1; return 0; } static void update_share_count(struct share_check *sc, int oldcount, int newcount, const struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; if (oldcount > 0 && newcount < 1) sc->share_count--; else if (oldcount < 1 && newcount > 0) sc->share_count++; if (newref->root_id == btrfs_root_id(sc->root) && newref->wanted_disk_byte == sc->data_bytenr && newref->key_for_search.objectid == sc->inum) sc->self_ref_count += newref->count; } /* * Add @newref to the @root rbtree, merging identical refs. * * Callers should assume that newref has been freed after calling. */ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct preftree *preftree, struct prelim_ref *newref, struct share_check *sc) { struct rb_root_cached *root; struct rb_node **p; struct rb_node *parent = NULL; struct prelim_ref *ref; int result; bool leftmost = true; root = &preftree->root; p = &root->rb_root.rb_node; while (*p) { parent = *p; ref = rb_entry(parent, struct prelim_ref, rbnode); result = prelim_ref_compare(ref, newref); if (result < 0) { p = &(*p)->rb_left; } else if (result > 0) { p = &(*p)->rb_right; leftmost = false; } else { /* Identical refs, merge them and free @newref */ struct extent_inode_elem *eie = ref->inode_list; while (eie && eie->next) eie = eie->next; if (!eie) ref->inode_list = newref->inode_list; else eie->next = newref->inode_list; trace_btrfs_prelim_ref_merge(fs_info, ref, newref, preftree->count); /* * A delayed ref can have newref->count < 0. * The ref->count is updated to follow any * BTRFS_[ADD|DROP]_DELAYED_REF actions. */ update_share_count(sc, ref->count, ref->count + newref->count, newref); ref->count += newref->count; free_pref(newref); return; } } update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* * Release the entire tree. We don't care about internal consistency so * just free everything and then reset the tree root. */ static void prelim_release(struct preftree *preftree) { struct prelim_ref *ref, *next_ref; rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root.rb_root, rbnode) { free_inode_elem_list(ref->inode_list); free_pref(ref); } preftree->root = RB_ROOT_CACHED; preftree->count = 0; } /* * the rules for all callers of this function are: * - obtaining the parent is the goal * - if you add a key, you must know that it is a correct key * - if you cannot add the parent or a correct key, then we will look into the * block later to set a correct key * * delayed refs * ============ * backref type | shared | indirect | shared | indirect * information | tree | tree | data | data * --------------------+--------+----------+--------+---------- * parent logical | y | - | - | - * key to resolve | - | y | y | y * tree block logical | - | - | - | - * root for resolving | y | y | y | y * * - column 1: we've the parent -> done * - column 2, 3, 4: we use the key to find the parent * * on disk refs (inline or keyed) * ============================== * backref type | shared | indirect | shared | indirect * information | tree | tree | data | data * --------------------+--------+----------+--------+---------- * parent logical | y | - | y | - * key to resolve | - | - | - | y * tree block logical | y | y | y | y * root for resolving | - | y | y | y * * - column 1, 3: we've the parent -> done * - column 2: we take the first key from the block to find the parent * (see add_missing_keys) * - column 4: we use the key to find the parent * * additional information that's available but not required to find the parent * block might help in merging entries to gain some speed. */ static int add_prelim_ref(const struct btrfs_fs_info *fs_info, struct preftree *preftree, u64 root_id, const struct btrfs_key *key, int level, u64 parent, u64 wanted_disk_byte, int count, struct share_check *sc, gfp_t gfp_mask) { struct prelim_ref *ref; if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID) return 0; ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); if (!ref) return -ENOMEM; ref->root_id = root_id; if (key) ref->key_for_search = *key; else memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); ref->inode_list = NULL; ref->level = level; ref->count = count; ref->parent = parent; ref->wanted_disk_byte = wanted_disk_byte; prelim_ref_insert(fs_info, preftree, ref, sc); return extent_is_shared(sc); } /* direct refs use root == 0, key == NULL */ static int add_direct_ref(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, int level, u64 parent, u64 wanted_disk_byte, int count, struct share_check *sc, gfp_t gfp_mask) { return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level, parent, wanted_disk_byte, count, sc, gfp_mask); } /* indirect refs use parent == 0 */ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, u64 root_id, const struct btrfs_key *key, int level, u64 wanted_disk_byte, int count, struct share_check *sc, gfp_t gfp_mask) { struct preftree *tree = &preftrees->indirect; if (!key) tree = &preftrees->indirect_missing_keys; return add_prelim_ref(fs_info, tree, root_id, key, level, 0, wanted_disk_byte, count, sc, gfp_mask); } static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) { struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; struct rb_node *parent = NULL; struct prelim_ref *ref = NULL; struct prelim_ref target = {}; int result; target.parent = bytenr; while (*p) { parent = *p; ref = rb_entry(parent, struct prelim_ref, rbnode); result = prelim_ref_compare(ref, &target); if (result < 0) p = &(*p)->rb_left; else if (result > 0) p = &(*p)->rb_right; else return 1; } return 0; } static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, int level) { int ret = 0; int slot; struct extent_buffer *eb; struct btrfs_key key; struct btrfs_key *key_for_search = &ref->key_for_search; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; u64 data_offset; u8 type; if (level != 0) { eb = path->nodes[level]; ret = ulist_add(parents, eb->start, 0, GFP_NOFS); if (ret < 0) return ret; return 0; } /* * 1. We normally enter this function with the path already pointing to * the first item to check. But sometimes, we may enter it with * slot == nritems. * 2. We are searching for normal backref but bytenr of this leaf * matches shared data backref * 3. The leaf owner is not equal to the root we are searching * * For these cases, go to the next leaf before we continue. */ eb = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, ctx->time_seq); } while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &key, slot); if (key.objectid != key_for_search->objectid || key.type != BTRFS_EXTENT_DATA_KEY) break; /* * We are searching for normal backref but bytenr of this leaf * matches shared data backref, OR * the leaf owner is not equal to the root we are searching for */ if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, ctx->time_seq); continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); type = btrfs_file_extent_type(eb, fi); if (type == BTRFS_FILE_EXTENT_INLINE) goto next; disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); data_offset = btrfs_file_extent_offset(eb, fi); if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; if (ref->key_for_search.offset == key.offset - data_offset) count++; else goto next; if (!ctx->skip_inode_ref_list) { ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) break; } if (ret > 0) goto next; ret = ulist_add_merge_ptr(parents, eb->start, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; if (!ret && !ctx->skip_inode_ref_list) { while (old->next) old = old->next; old->next = eie; } eie = NULL; } next: if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, ctx->time_seq); } if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); else if (ret > 0) ret = 0; return ret; } /* * resolve an indirect backref in the form (root_id, key, level) * to a logical address */ static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, struct btrfs_path *path, struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents) { struct btrfs_root *root; struct extent_buffer *eb; int ret = 0; int root_level; int level = ref->level; struct btrfs_key search_key = ref->key_for_search; /* * If we're search_commit_root we could possibly be holding locks on * other tree nodes. This happens when qgroups does backref walks when * adding new delayed refs. To deal with this we need to look in cache * for the root, and if we don't find it then we need to search the * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage * here. */ if (path->search_commit_root) root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id); else root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; } if (!path->search_commit_root && test_bit(BTRFS_ROOT_DELETING, &root->state)) { ret = -ENOENT; goto out; } if (btrfs_is_testing(ctx->fs_info)) { ret = -ENOENT; goto out; } if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); else if (ctx->time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, ctx->time_seq); if (root_level + 1 == level) goto out; /* * We can often find data backrefs with an offset that is too large * (>= LLONG_MAX, maximum allowed file offset) due to underflows when * subtracting a file's offset with the data offset of its * corresponding extent data item. This can happen for example in the * clone ioctl. * * So if we detect such case we set the search key's offset to zero to * make sure we will find the matching file extent item at * add_all_parents(), otherwise we will miss it because the offset * taken form the backref is much larger then the offset of the file * extent item. This can make us scan a very large number of file * extent items, but at least it will not make us miss any. * * This is an ugly workaround for a behaviour that should have never * existed, but it does and a fix for the clone ioctl would touch a lot * of places, cause backwards incompatibility and would not fix the * problem for extents cloned with older kernels. */ if (search_key.type == BTRFS_EXTENT_DATA_KEY && search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); btrfs_debug(ctx->fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", ref->root_id, level, ref->count, ret, ref->key_for_search.objectid, ref->key_for_search.type, ref->key_for_search.offset); if (ret < 0) goto out; eb = path->nodes[level]; while (!eb) { if (WARN_ON(!level)) { ret = 1; goto out; } level--; eb = path->nodes[level]; } ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level); out: btrfs_put_root(root); out_free: path->lowest_level = 0; btrfs_release_path(path); return ret; } static struct extent_inode_elem * unode_aux_to_inode_list(struct ulist_node *node) { if (!node) return NULL; return (struct extent_inode_elem *)(uintptr_t)node->aux; } static void free_leaf_list(struct ulist *ulist) { struct ulist_node *node; struct ulist_iterator uiter; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(ulist, &uiter))) free_inode_elem_list(unode_aux_to_inode_list(node)); ulist_free(ulist); } /* * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not * have a key. Each tree does merge on insertion. * * Once all of the references are located, we iterate over the tree of * indirect refs with missing keys. An appropriate key is located and * the ref is moved onto the tree for indirect refs. After all missing * keys are thus located, we iterate over the indirect ref tree, resolve * each reference, and then insert the resolved reference onto the * direct tree (merging there too). * * New backrefs (i.e., for parent nodes) are added to the appropriate * rbtree as they are encountered. The new backrefs are subsequently * resolved as above. */ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, struct btrfs_path *path, struct preftrees *preftrees, struct share_check *sc) { int err; int ret = 0; struct ulist *parents; struct ulist_node *node; struct ulist_iterator uiter; struct rb_node *rnode; parents = ulist_alloc(GFP_NOFS); if (!parents) return -ENOMEM; /* * We could trade memory usage for performance here by iterating * the tree, allocating new refs for each insertion, and then * freeing the entire indirect tree when we're done. In some test * cases, the tree can grow quite large (~200k objects). */ while ((rnode = rb_first_cached(&preftrees->indirect.root))) { struct prelim_ref *ref; ref = rb_entry(rnode, struct prelim_ref, rbnode); if (WARN(ref->parent, "BUG: direct ref found in indirect tree")) { ret = -EINVAL; goto out; } rb_erase_cached(&ref->rbnode, &preftrees->indirect.root); preftrees->indirect.count--; if (ref->count == 0) { free_pref(ref); continue; } if (sc && ref->root_id != btrfs_root_id(sc->root)) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; } err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ if (err == -ENOENT) { prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; } else if (err) { free_pref(ref); ret = err; goto out; } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; ref->inode_list = unode_aux_to_inode_list(node); /* Add a prelim_ref(s) for any other parent(s). */ while ((node = ulist_next(parents, &uiter))) { struct prelim_ref *new_ref; new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache, GFP_NOFS); if (!new_ref) { free_pref(ref); ret = -ENOMEM; goto out; } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); prelim_ref_insert(ctx->fs_info, &preftrees->direct, new_ref, NULL); } /* * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. */ prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); ulist_reinit(parents); cond_resched(); } out: /* * We may have inode lists attached to refs in the parents ulist, so we * must free them before freeing the ulist and its refs. */ free_leaf_list(parents); return ret; } /* * read tree blocks and add keys where required. */ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct preftrees *preftrees, bool lock) { struct prelim_ref *ref; struct extent_buffer *eb; struct preftree *tree = &preftrees->indirect_missing_keys; struct rb_node *node; while ((node = rb_first_cached(&tree->root))) { struct btrfs_tree_parent_check check = { 0 }; ref = rb_entry(node, struct prelim_ref, rbnode); rb_erase_cached(node, &tree->root); BUG_ON(ref->parent); /* should not be a direct ref */ BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); check.level = ref->level - 1; check.owner_root = ref->root_id; eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); } if (!extent_buffer_uptodate(eb)) { free_pref(ref); free_extent_buffer(eb); return -EIO; } if (lock) btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); else btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); if (lock) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); cond_resched(); } return 0; } /* * add all currently queued delayed refs from this head whose seq nr is * smaller or equal that seq to the list */ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head, u64 seq, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; struct btrfs_key key; struct rb_node *n; int count; int ret = 0; spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); if (node->seq > seq) continue; switch (node->action) { case BTRFS_ADD_DELAYED_EXTENT: case BTRFS_UPDATE_DELAYED_HEAD: WARN_ON(1); continue; case BTRFS_ADD_DELAYED_REF: count = node->ref_mod; break; case BTRFS_DROP_DELAYED_REF: count = node->ref_mod * -1; break; default: BUG(); } switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_key *key_ptr = NULL; /* The owner of a tree block ref is the level. */ int level = btrfs_delayed_ref_owner(node); if (head->extent_op && head->extent_op->update_key) { btrfs_disk_key_to_cpu(&key, &head->extent_op->key); key_ptr = &key; } ret = add_indirect_ref(fs_info, preftrees, node->ref_root, key_ptr, level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_BLOCK_REF_KEY: { /* * SHARED DIRECT METADATA backref * * The owner of a tree block ref is the level. */ int level = btrfs_delayed_ref_owner(node); ret = add_direct_ref(fs_info, preftrees, level + 1, node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_EXTENT_DATA_REF_KEY: { /* NORMAL INDIRECT DATA backref */ key.objectid = btrfs_delayed_ref_owner(node); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_delayed_ref_offset(node); /* * If we have a share check context and a reference for * another inode, we can't exit immediately. This is * because even if this is a BTRFS_ADD_DELAYED_REF * reference we may find next a BTRFS_DROP_DELAYED_REF * which cancels out this ADD reference. * * If this is a DROP reference and there was no previous * ADD reference, then we need to signal that when we * process references from the extent tree (through * add_inline_refs() and add_keyed_refs()), we should * not exit early if we find a reference for another * inode, because one of the delayed DROP references * may cancel that reference in the extent tree. */ if (sc && count < 0) sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, node->ref_root, &key, 0, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ ret = add_direct_ref(fs_info, preftrees, 0, node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; } default: WARN_ON(1); } /* * We must ignore BACKREF_FOUND_SHARED until all delayed * refs have been checked. */ if (ret && (ret != BACKREF_FOUND_SHARED)) break; } if (!ret) ret = extent_is_shared(sc); spin_unlock(&head->lock); return ret; } /* * add all inline backrefs for bytenr to the list * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, struct btrfs_path *path, int *info_level, struct preftrees *preftrees, struct share_check *sc) { int ret = 0; int slot; struct extent_buffer *leaf; struct btrfs_key key; struct btrfs_key found_key; unsigned long ptr; unsigned long end; struct btrfs_extent_item *ei; u64 flags; u64 item_size; /* * enumerate all inline refs */ leaf = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); if (ctx->check_extent_item) { ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx); if (ret) return ret; } flags = btrfs_extent_flags(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; if (found_key.type == BTRFS_EXTENT_ITEM_KEY && flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)ptr; *info_level = btrfs_tree_block_level(leaf, info); ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) { *info_level = found_key.offset; } else { BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } while (ptr < end) { struct btrfs_extent_inline_ref *iref; u64 offset; int type; iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); if (type == BTRFS_REF_TYPE_INVALID) return -EUCLEAN; offset = btrfs_extent_inline_ref_offset(leaf, iref); switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: ret = add_direct_ref(ctx->fs_info, preftrees, *info_level + 1, offset, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; int count; sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: ret = add_indirect_ref(ctx->fs_info, preftrees, offset, NULL, *info_level + 1, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; int count; u64 root; dref = (struct btrfs_extent_data_ref *)(&iref->offset); count = btrfs_extent_data_ref_count(leaf, dref); key.objectid = btrfs_extent_data_ref_objectid(leaf, dref); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } root = btrfs_extent_data_ref_root(leaf, dref); if (!ctx->skip_data_ref || !ctx->skip_data_ref(root, key.objectid, key.offset, ctx->user_ctx)) ret = add_indirect_ref(ctx->fs_info, preftrees, root, &key, 0, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_EXTENT_OWNER_REF_KEY: ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA)); break; default: WARN_ON(1); } if (ret) return ret; ptr += btrfs_extent_inline_ref_size(type); } return 0; } /* * add all non-inline backrefs for bytenr to the list * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, struct btrfs_root *extent_root, struct btrfs_path *path, int info_level, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_fs_info *fs_info = extent_root->fs_info; int ret; int slot; struct extent_buffer *leaf; struct btrfs_key key; while (1) { ret = btrfs_next_item(extent_root, path); if (ret < 0) break; if (ret) { ret = 0; break; } slot = path->slots[0]; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != ctx->bytenr) break; if (key.type < BTRFS_TREE_BLOCK_REF_KEY) continue; if (key.type > BTRFS_SHARED_DATA_REF_KEY) break; switch (key.type) { case BTRFS_SHARED_BLOCK_REF_KEY: /* SHARED DIRECT METADATA backref */ ret = add_direct_ref(fs_info, preftrees, info_level + 1, key.offset, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ struct btrfs_shared_data_ref *sdref; int count; sdref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = add_direct_ref(fs_info, preftrees, 0, key.offset, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ ret = add_indirect_ref(fs_info, preftrees, key.offset, NULL, info_level + 1, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { /* NORMAL INDIRECT DATA backref */ struct btrfs_extent_data_ref *dref; int count; u64 root; dref = btrfs_item_ptr(leaf, slot, struct btrfs_extent_data_ref); count = btrfs_extent_data_ref_count(leaf, dref); key.objectid = btrfs_extent_data_ref_objectid(leaf, dref); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } root = btrfs_extent_data_ref_root(leaf, dref); if (!ctx->skip_data_ref || !ctx->skip_data_ref(root, key.objectid, key.offset, ctx->user_ctx)) ret = add_indirect_ref(fs_info, preftrees, root, &key, 0, ctx->bytenr, count, sc, GFP_NOFS); break; } default: WARN_ON(1); } if (ret) return ret; } return ret; } /* * The caller has joined a transaction or is holding a read lock on the * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. */ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; if (!current->journal_info) lockdep_assert_held(&fs_info->commit_root_sem); if (!ctx->use_path_cache) return false; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) return false; /* * Level -1 is used for the data extent, which is not reliable to cache * because its reference count can increase or decrease without us * realizing. We cache results only for extent buffers that lead from * the root node down to the leaf with the file extent item. */ ASSERT(level >= 0); entry = &ctx->path_cache_entries[level]; /* Unused cache entry or being used for some other extent buffer. */ if (entry->bytenr != bytenr) return false; /* * We cached a false result, but the last snapshot generation of the * root changed, so we now have a snapshot. Don't trust the result. */ if (!entry->is_shared && entry->gen != btrfs_root_last_snapshot(&root->root_item)) return false; /* * If we cached a true result and the last generation used for dropping * a root changed, we can not trust the result, because the dropped root * could be a snapshot sharing this extent buffer. */ if (entry->is_shared && entry->gen != btrfs_get_last_root_drop_gen(fs_info)) return false; *is_shared = entry->is_shared; /* * If the node at this level is shared, than all nodes below are also * shared. Currently some of the nodes below may be marked as not shared * because we have just switched from one leaf to another, and switched * also other nodes above the leaf and below the current level, so mark * them as shared. */ if (*is_shared) { for (int i = 0; i < level; i++) { ctx->path_cache_entries[i].is_shared = true; ctx->path_cache_entries[i].gen = entry->gen; } } return true; } /* * The caller has joined a transaction or is holding a read lock on the * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. */ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; u64 gen; if (!current->journal_info) lockdep_assert_held(&fs_info->commit_root_sem); if (!ctx->use_path_cache) return; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) return; /* * Level -1 is used for the data extent, which is not reliable to cache * because its reference count can increase or decrease without us * realizing. We cache results only for extent buffers that lead from * the root node down to the leaf with the file extent item. */ ASSERT(level >= 0); if (is_shared) gen = btrfs_get_last_root_drop_gen(fs_info); else gen = btrfs_root_last_snapshot(&root->root_item); entry = &ctx->path_cache_entries[level]; entry->bytenr = bytenr; entry->is_shared = is_shared; entry->gen = gen; /* * If we found an extent buffer is shared, set the cache result for all * extent buffers below it to true. As nodes in the path are COWed, * their sharedness is moved to their children, and if a leaf is COWed, * then the sharedness of a data extent becomes direct, the refcount of * data extent is increased in the extent item at the extent tree. */ if (is_shared) { for (int i = 0; i < level; i++) { entry = &ctx->path_cache_entries[i]; entry->is_shared = is_shared; entry->gen = gen; } } } /* * this adds all existing backrefs (inline backrefs, backrefs and delayed * refs) for the given bytenr to the refs list, merges duplicates and resolves * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * * @ctx: Backref walking context object, must be not NULL. * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a * shared extent is detected. * * Otherwise this returns 0 for success and <0 for an error. * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, struct share_check *sc) { struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; struct prelim_ref *ref; struct rb_node *node; struct extent_inode_elem *eie = NULL; struct preftrees preftrees = { .direct = PREFTREE_INIT, .indirect = PREFTREE_INIT, .indirect_missing_keys = PREFTREE_INIT }; /* Roots ulist is not needed when using a sharedness check context. */ if (sc) ASSERT(ctx->roots == NULL); key.objectid = ctx->bytenr; key.offset = (u64)-1; if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (!ctx->trans) { path->search_commit_root = 1; path->skip_locking = 1; } if (ctx->time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; again: head = NULL; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ ret = -EUCLEAN; goto out; } if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) && ctx->time_seq != BTRFS_SEQ_LAST) { /* * We have a specific time_seq we care about and trans which * means we have the path lock, we need to grab the ref head and * lock it so we have a consistent view of the refs at the given * time. */ delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); /* * Mutex was contended, block until it's * released and try again */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); goto again; } spin_unlock(&delayed_refs->lock); ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq, &preftrees, sc); mutex_unlock(&head->mutex); if (ret) goto out; } else { spin_unlock(&delayed_refs->lock); } } if (path->slots[0]) { struct extent_buffer *leaf; int slot; path->slots[0]--; leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = add_inline_refs(ctx, path, &info_level, &preftrees, sc); if (ret) goto out; ret = add_keyed_refs(ctx, root, path, info_level, &preftrees, sc); if (ret) goto out; } } /* * If we have a share context and we reached here, it means the extent * is not directly shared (no multiple reference items for it), * otherwise we would have exited earlier with a return value of * BACKREF_FOUND_SHARED after processing delayed references or while * processing inline or keyed references from the extent tree. * The extent may however be indirectly shared through shared subtrees * as a result from creating snapshots, so we determine below what is * its parent node, in case we are dealing with a metadata extent, or * what's the leaf (or leaves), from a fs tree, that has a file extent * item pointing to it in case we are dealing with a data extent. */ ASSERT(extent_is_shared(sc) == 0); /* * If we are here for a data extent and we have a share_check structure * it means the data extent is not directly shared (does not have * multiple reference items), so we have to check if a path in the fs * tree (going from the root node down to the leaf that has the file * extent item pointing to the data extent) is shared, that is, if any * of the extent buffers in the path is referenced by other trees. */ if (sc && ctx->bytenr == sc->data_bytenr) { /* * If our data extent is from a generation more recent than the * last generation used to snapshot the root, then we know that * it can not be shared through subtrees, so we can skip * resolving indirect references, there's no point in * determining the extent buffers for the path from the fs tree * root node down to the leaf that has the file extent item that * points to the data extent. */ if (sc->data_extent_gen > btrfs_root_last_snapshot(&sc->root->root_item)) { ret = BACKREF_FOUND_NOT_SHARED; goto out; } /* * If we are only determining if a data extent is shared or not * and the corresponding file extent item is located in the same * leaf as the previous file extent item, we can skip resolving * indirect references for a data extent, since the fs tree path * is the same (same leaf, so same path). We skip as long as the * cached result for the leaf is valid and only if there's only * one file extent item pointing to the data extent, because in * the case of multiple file extent items, they may be located * in different leaves and therefore we have multiple paths. */ if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr && sc->self_ref_count == 1) { bool cached; bool is_shared; cached = lookup_backref_shared_cache(sc->ctx, sc->root, sc->ctx->curr_leaf_bytenr, 0, &is_shared); if (cached) { if (is_shared) ret = BACKREF_FOUND_SHARED; else ret = BACKREF_FOUND_NOT_SHARED; goto out; } } } btrfs_release_path(path); ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(ctx, path, &preftrees, sc); if (ret) goto out; WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root)); /* * This walks the tree of merged and resolved refs. Tree blocks are * read in as needed. Unique entries are added to the ulist, and * the list of found roots is updated. * * We release the entire tree in one go before returning. */ node = rb_first_cached(&preftrees.direct.root); while (node) { ref = rb_entry(node, struct prelim_ref, rbnode); node = rb_next(&ref->rbnode); /* * ref->count < 0 can happen here if there are delayed * refs with a node->action of BTRFS_DROP_DELAYED_REF. * prelim_ref_insert() relies on this when merging * identical refs to keep the overall count correct. * prelim_ref_insert() will merge only those refs * which compare identically. Any refs having * e.g. different offsets would not be merged, * and would retain their original ref->count < 0. */ if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) goto out; } if (ref->count && ref->parent) { if (!ctx->skip_inode_ref_list && !ref->inode_list && ref->level == 0) { struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; check.level = ref->level; eb = read_tree_block(ctx->fs_info, ref->parent, &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; } if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; } if (!path->skip_locking) btrfs_tree_read_lock(eb); ret = find_extent_in_eb(ctx, eb, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) goto out; ref->inode_list = eie; /* * We transferred the list ownership to the ref, * so set to NULL to avoid a double free in case * an error happens after this. */ eie = NULL; } ret = ulist_add_merge_ptr(ctx->refs, ref->parent, ref->inode_list, (void **)&eie, GFP_NOFS); if (ret < 0) goto out; if (!ret && !ctx->skip_inode_ref_list) { /* * We've recorded that parent, so we must extend * its inode list here. * * However if there was corruption we may not * have found an eie, return an error in this * case. */ ASSERT(eie); if (!eie) { ret = -EUCLEAN; goto out; } while (eie->next) eie = eie->next; eie->next = ref->inode_list; } eie = NULL; /* * We have transferred the inode list ownership from * this ref to the ref we added to the 'refs' ulist. * So set this ref's inode list to NULL to avoid * use-after-free when our caller uses it or double * frees in case an error happens before we return. */ ref->inode_list = NULL; } cond_resched(); } out: btrfs_free_path(path); prelim_release(&preftrees.direct); prelim_release(&preftrees.indirect); prelim_release(&preftrees.indirect_missing_keys); if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); return ret; } /* * Finds all leaves with a reference to the specified combination of * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are * added to the ulist at @ctx->refs, and that ulist is allocated by this * function. The caller should free the ulist with free_leaf_list() if * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is * enough. * * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. */ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) { int ret; ASSERT(ctx->refs == NULL); ctx->refs = ulist_alloc(GFP_NOFS); if (!ctx->refs) return -ENOMEM; ret = find_parent_nodes(ctx, NULL); if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || (ret < 0 && ret != -ENOENT)) { free_leaf_list(ctx->refs); ctx->refs = NULL; return ret; } return 0; } /* * Walk all backrefs for a given extent to find all roots that reference this * extent. Walking a backref means finding all extents that reference this * extent and in turn walk the backrefs of those, too. Naturally this is a * recursive process, but here it is implemented in an iterative fashion: We * find all referencing extents for the extent in question and put them on a * list. In turn, we find all referencing extents for those, further appending * to the list. The way we iterate the list allows adding more elements after * the current while iterating. The process stops when we reach the end of the * list. * * Found roots are added to @ctx->roots, which is allocated by this function if * it points to NULL, in which case the caller is responsible for freeing it * after it's not needed anymore. * This function requires @ctx->refs to be NULL, as it uses it for allocating a * ulist to do temporary work, and frees it before returning. * * Returns 0 on success, < 0 on error. */ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { const u64 orig_bytenr = ctx->bytenr; const bool orig_skip_inode_ref_list = ctx->skip_inode_ref_list; bool roots_ulist_allocated = false; struct ulist_iterator uiter; int ret = 0; ASSERT(ctx->refs == NULL); ctx->refs = ulist_alloc(GFP_NOFS); if (!ctx->refs) return -ENOMEM; if (!ctx->roots) { ctx->roots = ulist_alloc(GFP_NOFS); if (!ctx->roots) { ulist_free(ctx->refs); ctx->refs = NULL; return -ENOMEM; } roots_ulist_allocated = true; } ctx->skip_inode_ref_list = true; ULIST_ITER_INIT(&uiter); while (1) { struct ulist_node *node; ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { if (roots_ulist_allocated) { ulist_free(ctx->roots); ctx->roots = NULL; } break; } ret = 0; node = ulist_next(ctx->refs, &uiter); if (!node) break; ctx->bytenr = node->val; cond_resched(); } ulist_free(ctx->refs); ctx->refs = NULL; ctx->bytenr = orig_bytenr; ctx->skip_inode_ref_list = orig_skip_inode_ref_list; return ret; } int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem) { int ret; if (!ctx->trans && !skip_commit_root_sem) down_read(&ctx->fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(ctx); if (!ctx->trans && !skip_commit_root_sem) up_read(&ctx->fs_info->commit_root_sem); return ret; } struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) { struct btrfs_backref_share_check_ctx *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; ulist_init(&ctx->refs); return ctx; } void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx) { if (!ctx) return; ulist_release(&ctx->refs); kfree(ctx); } /* * Check if a data extent is shared or not. * * @inode: The inode whose extent we are checking. * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. * @ctx: A backref sharedness check context. * * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the * one passed in. This provides a significant performance benefit for * callers (such as fiemap) which want to know whether the extent is * shared but do not need a ref count. * * This attempts to attach to the running transaction in order to account for * delayed refs, but continues on even when no running transaction exists. * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct btrfs_backref_share_check_ctx *ctx) { struct btrfs_backref_walk_ctx walk_ctx = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct ulist_iterator uiter; struct ulist_node *node; struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { .ctx = ctx, .root = root, .inum = btrfs_ino(inode), .data_bytenr = bytenr, .data_extent_gen = extent_gen, .share_count = 0, .self_ref_count = 0, .have_delayed_delete_refs = false, }; int level; bool leaf_cached; bool leaf_is_shared; for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { if (ctx->prev_extents_cache[i].bytenr == bytenr) return ctx->prev_extents_cache[i].is_shared; } ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) { ret = PTR_ERR(trans); goto out; } trans = NULL; down_read(&fs_info->commit_root_sem); } else { btrfs_get_tree_mod_seq(fs_info, &elem); walk_ctx.time_seq = elem.seq; } ctx->use_path_cache = true; /* * We may have previously determined that the current leaf is shared. * If it is, then we have a data extent that is shared due to a shared * subtree (caused by snapshotting) and we don't need to check for data * backrefs. If the leaf is not shared, then we must do backref walking * to determine if the data extent is shared through reflinks. */ leaf_cached = lookup_backref_shared_cache(ctx, root, ctx->curr_leaf_bytenr, 0, &leaf_is_shared); if (leaf_cached && leaf_is_shared) { ret = 1; goto out_trans; } walk_ctx.skip_inode_ref_list = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; walk_ctx.refs = &ctx->refs; /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); while (1) { const unsigned long prev_ref_count = ctx->refs.nnodes; walk_ctx.bytenr = bytenr; ret = find_parent_nodes(&walk_ctx, &shared); if (ret == BACKREF_FOUND_SHARED || ret == BACKREF_FOUND_NOT_SHARED) { /* If shared must return 1, otherwise return 0. */ ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, level, ret == 1); break; } if (ret < 0 && ret != -ENOENT) break; ret = 0; /* * More than one extent buffer (bytenr) may have been added to * the ctx->refs ulist, in which case we have to check multiple * tree paths in case the first one is not shared, so we can not * use the path cache which is made for a single path. Multiple * extent buffers at the current level happen when: * * 1) level -1, the data extent: If our data extent was not * directly shared (without multiple reference items), then * it might have a single reference item with a count > 1 for * the same offset, which means there are 2 (or more) file * extent items that point to the data extent - this happens * when a file extent item needs to be split and then one * item gets moved to another leaf due to a b+tree leaf split * when inserting some item. In this case the file extent * items may be located in different leaves and therefore * some of the leaves may be referenced through shared * subtrees while others are not. Since our extent buffer * cache only works for a single path (by far the most common * case and simpler to deal with), we can not use it if we * have multiple leaves (which implies multiple paths). * * 2) level >= 0, a tree node/leaf: We can have a mix of direct * and indirect references on a b+tree node/leaf, so we have * to check multiple paths, and the extent buffer (the * current bytenr) may be shared or not. One example is * during relocation as we may get a shared tree block ref * (direct ref) and a non-shared tree block ref (indirect * ref) for the same node/leaf. */ if ((ctx->refs.nnodes - prev_ref_count) > 1) ctx->use_path_cache = false; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, level, false); node = ulist_next(&ctx->refs, &uiter); if (!node) break; bytenr = node->val; if (ctx->use_path_cache) { bool is_shared; bool cached; level++; cached = lookup_backref_shared_cache(ctx, root, bytenr, level, &is_shared); if (cached) { ret = (is_shared ? 1 : 0); break; } } shared.share_count = 0; shared.have_delayed_delete_refs = false; cond_resched(); } /* * If the path cache is disabled, then it means at some tree level we * got multiple parents due to a mix of direct and indirect backrefs or * multiple leaves with file extent items pointing to the same data * extent. We have to invalidate the cache and cache only the sharedness * result for the levels where we got only one node/reference. */ if (!ctx->use_path_cache) { int i = 0; level--; if (ret >= 0 && level >= 0) { bytenr = ctx->path_cache_entries[level].bytenr; ctx->use_path_cache = true; store_backref_shared_cache(ctx, root, bytenr, level, ret); i = level + 1; } for ( ; i < BTRFS_MAX_LEVEL; i++) ctx->path_cache_entries[i].bytenr = 0; } /* * Cache the sharedness result for the data extent if we know our inode * has more than 1 file extent item that refers to the data extent. */ if (ret >= 0 && shared.self_ref_count > 1) { int slot = ctx->prev_extents_cache_slot; ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr; ctx->prev_extents_cache[slot].is_shared = (ret == 1); slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; ctx->prev_extents_cache_slot = slot; } out_trans: if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); } else { up_read(&fs_info->commit_root_sem); } out: ulist_release(&ctx->refs); ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr; return ret; } int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off) { int ret, slot; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_inode_extref *extref; const struct extent_buffer *leaf; unsigned long ptr; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; key.offset = start_off; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; while (1) { leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { /* * If the item at offset is not found, * btrfs_search_slot will point us to the slot * where it should be inserted. In our case * that will be the slot directly before the * next INODE_REF_KEY_V2 item. In the case * that we're pointing to the last slot in a * leaf, we must move one leaf over. */ ret = btrfs_next_leaf(root, path); if (ret) { if (ret >= 1) ret = -ENOENT; break; } continue; } btrfs_item_key_to_cpu(leaf, &found_key, slot); /* * Check that we're still looking at an extended ref key for * this particular objectid. If we have different * objectid or type then there are no more to be found * in the tree and we can exit. */ ret = -ENOENT; if (found_key.objectid != inode_objectid) break; if (found_key.type != BTRFS_INODE_EXTREF_KEY) break; ret = 0; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); extref = (struct btrfs_inode_extref *)ptr; *ret_extref = extref; if (found_off) *found_off = found_key.offset; break; } return ret; } /* * this iterates to turn a name (from iref/extref) into a full filesystem path. * Elements of the path are separated by '/' and the path is guaranteed to be * 0-terminated. the path is only given within the current file system. * Therefore, it never starts with a '/'. the caller is responsible to provide * "size" bytes in "dest". the dest buffer will be filled backwards. finally, * the start point of the resulting string is returned. this pointer is within * dest, normally. * in case the path buffer would overflow, the pointer is decremented further * as if output was written to the buffer, though no more output is actually * generated. that way, the caller can determine how much space would be * required for the path to fit into the buffer. in that case, the returned * value will be smaller than dest. callers must check this! */ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, char *dest, u32 size) { int slot; u64 next_inum; int ret; s64 bytes_left = ((s64)size) - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; struct btrfs_inode_ref *iref; if (bytes_left >= 0) dest[bytes_left] = '\0'; while (1) { bytes_left -= name_len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, name_off, name_len); if (eb != eb_in) { if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, BTRFS_INODE_REF_KEY, &found_key); if (ret > 0) ret = -ENOENT; if (ret) break; next_inum = found_key.offset; /* regular exit ahead */ if (parent == next_inum) break; slot = path->slots[0]; eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { path->nodes[0] = NULL; path->locks[0] = 0; } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); parent = next_inum; --bytes_left; if (bytes_left >= 0) dest[bytes_left] = '/'; } btrfs_release_path(path); if (ret) return ERR_PTR(ret); return dest + bytes_left; } /* * this makes the path point to (logical EXTENT_ITEM *) * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for * tree blocks and <0 on error. */ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags_ret) { struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); int ret; u64 flags; u64 size = 0; u32 item_size; const struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ return -EUCLEAN; } ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { if (ret > 0) ret = -ENOENT; return ret; } btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->nodesize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; if (found_key->objectid > logical || found_key->objectid + size <= logical) { btrfs_debug(fs_info, "logical %llu is not within any extent", logical); return -ENOENT; } eb = path->nodes[0]; item_size = btrfs_item_size(eb, path->slots[0]); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); btrfs_debug(fs_info, "logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u", logical, logical - found_key->objectid, found_key->objectid, found_key->offset, flags, item_size); WARN_ON(!flags_ret); if (flags_ret) { if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; else if (flags & BTRFS_EXTENT_FLAG_DATA) *flags_ret = BTRFS_EXTENT_FLAG_DATA; else BUG(); return 0; } return -EIO; } /* * helper function to iterate extent inline refs. ptr must point to a 0 value * for the first call and may be modified. it is used to track state. * if more refs exist, 0 is returned and the next call to * get_extent_inline_ref must pass the modified ptr parameter to get the * next ref. after the last ref was processed, 1 is returned. * returns <0 on error */ static int get_extent_inline_ref(unsigned long *ptr, const struct extent_buffer *eb, const struct btrfs_key *key, const struct btrfs_extent_item *ei, u32 item_size, struct btrfs_extent_inline_ref **out_eiref, int *out_type) { unsigned long end; u64 flags; struct btrfs_tree_block_info *info; if (!*ptr) { /* first call */ flags = btrfs_extent_flags(eb, ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { if (key->type == BTRFS_METADATA_ITEM_KEY) { /* a skinny metadata extent */ *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); } else { WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); info = (struct btrfs_tree_block_info *)(ei + 1); *out_eiref = (struct btrfs_extent_inline_ref *)(info + 1); } } else { *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); } *ptr = (unsigned long)*out_eiref; if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) return -ENOENT; } end = (unsigned long)ei + item_size; *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, BTRFS_REF_TYPE_ANY); if (*out_type == BTRFS_REF_TYPE_INVALID) return -EUCLEAN; *ptr += btrfs_extent_inline_ref_size(*out_type); WARN_ON(*ptr > end); if (*ptr == end) return 1; /* last */ return 0; } /* * reads the tree block backref for an extent. tree level and root are returned * through out_level and out_root. ptr must point to a 0 value for the first * call and may be modified (see get_extent_inline_ref comment). * returns 0 if data was provided, 1 if there was no more data to provide or * <0 on error. */ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_key *key, struct btrfs_extent_item *ei, u32 item_size, u64 *out_root, u8 *out_level) { int ret; int type; struct btrfs_extent_inline_ref *eiref; if (*ptr == (unsigned long)-1) return 1; while (1) { ret = get_extent_inline_ref(ptr, eb, key, ei, item_size, &eiref, &type); if (ret < 0) return ret; if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY) break; if (ret == 1) return 1; } /* we can treat both ref types equally here */ *out_root = btrfs_extent_inline_ref_offset(eb, eiref); if (key->type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); *out_level = btrfs_tree_block_level(eb, info); } else { ASSERT(key->type == BTRFS_METADATA_ITEM_KEY); *out_level = (u8)key->offset; } if (ret == 1) *ptr = (unsigned long)-1; return 0; } static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, struct extent_inode_elem *inode_list, u64 root, u64 extent_item_objectid, iterate_extent_inodes_t *iterate, void *ctx) { struct extent_inode_elem *eie; int ret = 0; for (eie = inode_list; eie; eie = eie->next) { btrfs_debug(fs_info, "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu", extent_item_objectid, eie->inum, eie->offset, root); ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx); if (ret) { btrfs_debug(fs_info, "stopping iteration for %llu due to ret=%d", extent_item_objectid, ret); break; } } return ret; } /* * calls iterate() for every inode that references the extent identified by * the given parameters. * when the iterator function returns a non-zero value, iteration stops. */ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, bool search_commit_root, iterate_extent_inodes_t *iterate, void *user_ctx) { int ret; struct ulist *refs; struct ulist_node *ref_node; struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu", ctx->bytenr); ASSERT(ctx->trans == NULL); ASSERT(ctx->roots == NULL); if (!search_commit_root) { struct btrfs_trans_handle *trans; trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); trans = NULL; } ctx->trans = trans; } if (ctx->trans) { btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem); ctx->time_seq = seq_elem.seq; } else { down_read(&ctx->fs_info->commit_root_sem); } ret = btrfs_find_all_leafs(ctx); if (ret) goto out; refs = ctx->refs; ctx->refs = NULL; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { const u64 leaf_bytenr = ref_node->val; struct ulist_node *root_node; struct ulist_iterator root_uiter; struct extent_inode_elem *inode_list; inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux; if (ctx->cache_lookup) { const u64 *root_ids; int root_count; bool cached; cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx, &root_ids, &root_count); if (cached) { for (int i = 0; i < root_count; i++) { ret = iterate_leaf_refs(ctx->fs_info, inode_list, root_ids[i], leaf_bytenr, iterate, user_ctx); if (ret) break; } continue; } } if (!ctx->roots) { ctx->roots = ulist_alloc(GFP_NOFS); if (!ctx->roots) { ret = -ENOMEM; break; } } ctx->bytenr = leaf_bytenr; ret = btrfs_find_all_roots_safe(ctx); if (ret) break; if (ctx->cache_store) ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx); ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); ret = iterate_leaf_refs(ctx->fs_info, inode_list, root_node->val, ctx->bytenr, iterate, user_ctx); } ulist_reinit(ctx->roots); } free_leaf_list(refs); out: if (ctx->trans) { btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem); btrfs_end_transaction(ctx->trans); ctx->trans = NULL; } else { up_read(&ctx->fs_info->commit_root_sem); } ulist_free(ctx->roots); ctx->roots = NULL; if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP) ret = 0; return ret; } static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx) { struct btrfs_data_container *inodes = ctx; const size_t c = 3 * sizeof(u64); if (inodes->bytes_left >= c) { inodes->bytes_left -= c; inodes->val[inodes->elem_cnt] = inum; inodes->val[inodes->elem_cnt + 1] = offset; inodes->val[inodes->elem_cnt + 2] = root; inodes->elem_cnt += 3; } else { inodes->bytes_missing += c - inodes->bytes_left; inodes->bytes_left = 0; inodes->elem_missed += 3; } return 0; } int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, bool ignore_offset) { struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); btrfs_release_path(path); if (ret < 0) return ret; if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; walk_ctx.bytenr = found_key.objectid; if (ignore_offset) walk_ctx.ignore_extent_item_pos = true; else walk_ctx.extent_item_pos = logical - found_key.objectid; walk_ctx.fs_info = fs_info; return iterate_extent_inodes(&walk_ctx, search_commit_root, build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, struct extent_buffer *eb, struct inode_fs_paths *ipath); static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) { int ret = 0; int slot; u32 cur; u32 len; u32 name_len; u64 parent = 0; int found = 0; struct btrfs_root *fs_root = ipath->fs_root; struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_ref *iref; struct btrfs_key found_key; while (!ret) { ret = btrfs_find_item(fs_root, path, inum, parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY, &found_key); if (ret < 0) break; if (ret) { ret = found ? 0 : -ENOENT; break; } ++found; parent = found_key.offset; slot = path->slots[0]; eb = btrfs_clone_extent_buffer(path->nodes[0]); if (!eb) { ret = -ENOMEM; break; } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, btrfs_root_id(fs_root)); ret = inode_to_path(parent, name_len, (unsigned long)(iref + 1), eb, ipath); if (ret) break; len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } free_extent_buffer(eb); } btrfs_release_path(path); return ret; } static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath) { int ret; int slot; u64 offset = 0; u64 parent; int found = 0; struct btrfs_root *fs_root = ipath->fs_root; struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_extref *extref; u32 item_size; u32 cur_offset; unsigned long ptr; while (1) { ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, &offset); if (ret < 0) break; if (ret) { ret = found ? 0 : -ENOENT; break; } ++found; slot = path->slots[0]; eb = btrfs_clone_extent_buffer(path->nodes[0]); if (!eb) { ret = -ENOMEM; break; } btrfs_release_path(path); item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; while (cur_offset < item_size) { u32 name_len; extref = (struct btrfs_inode_extref *)(ptr + cur_offset); parent = btrfs_inode_extref_parent(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref); ret = inode_to_path(parent, name_len, (unsigned long)&extref->name, eb, ipath); if (ret) break; cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } free_extent_buffer(eb); offset++; } btrfs_release_path(path); return ret; } /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, struct extent_buffer *eb, struct inode_fs_paths *ipath) { char *fspath; char *fspath_min; int i = ipath->fspath->elem_cnt; const int s_ptr = sizeof(char *); u32 bytes_left; bytes_left = ipath->fspath->bytes_left > s_ptr ? ipath->fspath->bytes_left - s_ptr : 0; fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, name_off, eb, inum, fspath_min, bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); if (fspath > fspath_min) { ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; } return 0; } /* * this dumps all file system paths to the inode into the ipath struct, provided * is has been created large enough. each path is zero-terminated and accessed * from ipath->fspath->val[i]. * when it returns, there are ipath->fspath->elem_cnt number of paths available * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise, * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would * have been needed to return all paths. */ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { int ret; int found_refs = 0; ret = iterate_inode_refs(inum, ipath); if (!ret) ++found_refs; else if (ret != -ENOENT) return ret; ret = iterate_inode_extrefs(inum, ipath); if (ret == -ENOENT && found_refs) return 0; return ret; } struct btrfs_data_container *init_data_container(u32 total_bytes) { struct btrfs_data_container *data; size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); data = kvzalloc(alloc_bytes, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); if (total_bytes >= sizeof(*data)) data->bytes_left = total_bytes - sizeof(*data); else data->bytes_missing = sizeof(*data) - total_bytes; return data; } /* * allocates space to return multiple file system paths for an inode. * total_bytes to allocate are passed, note that space usable for actual path * information will be total_bytes - sizeof(struct inode_fs_paths). * the returned pointer must be freed with free_ipath() in the end. */ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path) { struct inode_fs_paths *ifp; struct btrfs_data_container *fspath; fspath = init_data_container(total_bytes); if (IS_ERR(fspath)) return ERR_CAST(fspath); ifp = kmalloc(sizeof(*ifp), GFP_KERNEL); if (!ifp) { kvfree(fspath); return ERR_PTR(-ENOMEM); } ifp->btrfs_path = path; ifp->fspath = fspath; ifp->fs_root = fs_root; return ifp; } void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; kvfree(ipath->fspath); kfree(ipath); } struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; ret = kzalloc(sizeof(*ret), GFP_NOFS); if (!ret) return NULL; ret->path = btrfs_alloc_path(); if (!ret->path) { kfree(ret); return NULL; } /* Current backref iterator only supports iteration in commit root */ ret->path->search_commit_root = 1; ret->path->skip_locking = 1; ret->fs_info = fs_info; return ret; } static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) { iter->bytenr = 0; iter->item_ptr = 0; iter->cur_ptr = 0; iter->end_ptr = 0; btrfs_release_path(iter->path); memset(&iter->cur_key, 0, sizeof(iter->cur_key)); } int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct btrfs_path *path = iter->path; struct btrfs_extent_item *ei; struct btrfs_key key; int ret; key.objectid = bytenr; key.type = BTRFS_METADATA_ITEM_KEY; key.offset = (u64)-1; iter->bytenr = bytenr; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ ret = -EUCLEAN; goto release; } if (path->slots[0] == 0) { WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); ret = -EUCLEAN; goto release; } path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if ((key.type != BTRFS_EXTENT_ITEM_KEY && key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) { ret = -ENOENT; goto release; } memcpy(&iter->cur_key, &key, sizeof(key)); iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(path->nodes[0], path->slots[0])); ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); /* * Only support iteration on tree backref yet. * * This is an extra precaution for non skinny-metadata, where * EXTENT_ITEM is also used for tree blocks, that we can only use * extent flags to determine if it's a tree block. */ if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) { ret = -ENOTSUPP; goto release; } iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei)); /* If there is no inline backref, go search for keyed backref */ if (iter->cur_ptr >= iter->end_ptr) { ret = btrfs_next_item(extent_root, path); /* No inline nor keyed ref */ if (ret > 0) { ret = -ENOENT; goto release; } if (ret < 0) goto release; btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]); if (iter->cur_key.objectid != bytenr || (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY && iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) { ret = -ENOENT; goto release; } iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->item_ptr = iter->cur_ptr; iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( path->nodes[0], path->slots[0])); } return 0; release: btrfs_backref_iter_release(iter); return ret; } static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter) { if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) return true; return false; } /* * Go to the next backref item of current bytenr, can be either inlined or * keyed. * * Caller needs to check whether it's inline ref or not by iter->cur_key. * * Return 0 if we get next backref without problem. * Return >0 if there is no extra backref for this bytenr. * Return <0 if there is something wrong happened. */ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { struct extent_buffer *eb = iter->path->nodes[0]; struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; int ret; u32 size; if (btrfs_backref_iter_is_inline_ref(iter)) { /* We're still inside the inline refs */ ASSERT(iter->cur_ptr < iter->end_ptr); if (btrfs_backref_has_tree_block_info(iter)) { /* First tree block info */ size = sizeof(struct btrfs_tree_block_info); } else { /* Use inline ref type to determine the size */ int type; iref = (struct btrfs_extent_inline_ref *) ((unsigned long)iter->cur_ptr); type = btrfs_extent_inline_ref_type(eb, iref); size = btrfs_extent_inline_ref_size(type); } iter->cur_ptr += size; if (iter->cur_ptr < iter->end_ptr) return 0; /* All inline items iterated, fall through */ } /* We're at keyed items, there is no inline item, go to the next one */ extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr); ret = btrfs_next_item(extent_root, iter->path); if (ret) return ret; btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]); if (iter->cur_key.objectid != iter->bytenr || (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY && iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY)) return 1; iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->cur_ptr = iter->item_ptr; iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], path->slots[0]); return 0; } void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, struct btrfs_backref_cache *cache, bool is_reloc) { int i; cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); INIT_LIST_HEAD(&cache->changed); INIT_LIST_HEAD(&cache->detached); INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; cache->is_reloc = is_reloc; } struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_cache *cache, u64 bytenr, int level) { struct btrfs_backref_node *node; ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL); node = kzalloc(sizeof(*node), GFP_NOFS); if (!node) return node; INIT_LIST_HEAD(&node->list); INIT_LIST_HEAD(&node->upper); INIT_LIST_HEAD(&node->lower); RB_CLEAR_NODE(&node->rb_node); cache->nr_nodes++; node->level = level; node->bytenr = bytenr; return node; } void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { if (node) { ASSERT(list_empty(&node->list)); ASSERT(list_empty(&node->lower)); ASSERT(node->eb == NULL); cache->nr_nodes--; btrfs_put_root(node->root); kfree(node); } } struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache) { struct btrfs_backref_edge *edge; edge = kzalloc(sizeof(*edge), GFP_NOFS); if (edge) cache->nr_edges++; return edge; } void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, struct btrfs_backref_edge *edge) { if (edge) { cache->nr_edges--; kfree(edge); } } void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node) { if (node->locked) { btrfs_tree_unlock(node->eb); node->locked = 0; } } void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node) { if (node->eb) { btrfs_backref_unlock_node_buffer(node); free_extent_buffer(node->eb); node->eb = NULL; } } /* * Drop the backref node from cache without cleaning up its children * edges. * * This can only be called on node without parent edges. * The children edges are still kept as is. */ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, struct btrfs_backref_node *node) { ASSERT(list_empty(&node->upper)); btrfs_backref_drop_node_buffer(node); list_del_init(&node->list); list_del_init(&node->lower); if (!RB_EMPTY_NODE(&node->rb_node)) rb_erase(&node->rb_node, &tree->rb_root); btrfs_backref_free_node(tree, node); } /* * Drop the backref node from cache, also cleaning up all its * upper edges and any uncached nodes in the path. * * This cleanup happens bottom up, thus the node should either * be the lowest node in the cache or a detached node. */ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; if (!node) return; BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); /* * Add the node to leaf node list if no other child block * cached. */ if (list_empty(&upper->lower)) { list_add_tail(&upper->lower, &cache->leaves); upper->lowest = 1; } } btrfs_backref_drop_node(cache, node); } /* * Release all nodes/edges from current cache */ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; int i; while (!list_empty(&cache->detached)) { node = list_entry(cache->detached.next, struct btrfs_backref_node, list); btrfs_backref_cleanup_node(cache, node); } while (!list_empty(&cache->leaves)) { node = list_entry(cache->leaves.next, struct btrfs_backref_node, lower); btrfs_backref_cleanup_node(cache, node); } for (i = 0; i < BTRFS_MAX_LEVEL; i++) { while (!list_empty(&cache->pending[i])) { node = list_first_entry(&cache->pending[i], struct btrfs_backref_node, list); btrfs_backref_cleanup_node(cache, node); } } ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); ASSERT(list_empty(&cache->changed)); ASSERT(list_empty(&cache->detached)); ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, struct btrfs_backref_node *lower, struct btrfs_backref_node *upper, int link_which) { ASSERT(upper && lower && upper->level == lower->level + 1); edge->node[LOWER] = lower; edge->node[UPPER] = upper; if (link_which & LINK_LOWER) list_add_tail(&edge->list[LOWER], &lower->upper); if (link_which & LINK_UPPER) list_add_tail(&edge->list[UPPER], &upper->lower); } /* * Handle direct tree backref * * Direct tree backref means, the backref item shows its parent bytenr * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined). * * @ref_key: The converted backref key. * For keyed backref, it's the item key. * For inlined backref, objectid is the bytenr, * type is btrfs_inline_ref_type, offset is * btrfs_inline_ref_offset. */ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, struct btrfs_key *ref_key, struct btrfs_backref_node *cur) { struct btrfs_backref_edge *edge; struct btrfs_backref_node *upper; struct rb_node *rb_node; ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY); /* Only reloc root uses backref pointing to itself */ if (ref_key->objectid == ref_key->offset) { struct btrfs_root *root; cur->is_reloc_root = 1; /* Only reloc backref cache cares about a specific root */ if (cache->is_reloc) { root = find_reloc_root(cache->fs_info, cur->bytenr); if (!root) return -ENOENT; cur->root = root; } else { /* * For generic purpose backref cache, reloc root node * is useless. */ list_add(&cur->list, &cache->useless_node); } return 0; } edge = btrfs_backref_alloc_edge(cache); if (!edge) return -ENOMEM; rb_node = rb_simple_search(&cache->rb_root, ref_key->offset); if (!rb_node) { /* Parent node not yet cached */ upper = btrfs_backref_alloc_node(cache, ref_key->offset, cur->level + 1); if (!upper) { btrfs_backref_free_edge(cache, edge); return -ENOMEM; } /* * Backrefs for the upper level block isn't cached, add the * block to pending list */ list_add_tail(&edge->list[UPPER], &cache->pending_edge); } else { /* Parent node already cached */ upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node); ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER); return 0; } /* * Handle indirect tree backref * * Indirect tree backref means, we only know which tree the node belongs to. * We still need to do a tree search to find out the parents. This is for * TREE_BLOCK_REF backref (keyed or inlined). * * @trans: Transaction handle. * @ref_key: The same as @ref_key in handle_direct_tree_backref() * @tree_key: The first key of this tree block. * @path: A clean (released) path, to avoid allocating path every time * the function get called. */ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_key *ref_key, struct btrfs_key *tree_key, struct btrfs_backref_node *cur) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; struct btrfs_backref_edge *edge; struct extent_buffer *eb; struct btrfs_root *root; struct rb_node *rb_node; int level; bool need_check = true; int ret; root = btrfs_get_fs_root(fs_info, ref_key->offset, false); if (IS_ERR(root)) return PTR_ERR(root); if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) cur->cowonly = 1; if (btrfs_root_level(&root->root_item) == cur->level) { /* Tree root */ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); /* * For reloc backref cache, we may ignore reloc root. But for * general purpose backref cache, we can't rely on * btrfs_should_ignore_reloc_root() as it may conflict with * current running relocation and lead to missing root. * * For general purpose backref cache, reloc root detection is * completely relying on direct backref (key->offset is parent * bytenr), thus only do such check for reloc cache. */ if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) { btrfs_put_root(root); list_add(&cur->list, &cache->useless_node); } else { cur->root = root; } return 0; } level = cur->level + 1; /* Search the tree to find parent blocks referring to the block */ path->search_commit_root = 1; path->skip_locking = 1; path->lowest_level = level; ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0); path->lowest_level = 0; if (ret < 0) { btrfs_put_root(root); return ret; } if (ret > 0 && path->slots[level] > 0) path->slots[level]--; eb = path->nodes[level]; if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { btrfs_err(fs_info, "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", cur->bytenr, level - 1, btrfs_root_id(root), tree_key->objectid, tree_key->type, tree_key->offset); btrfs_put_root(root); ret = -ENOENT; goto out; } lower = cur; /* Add all nodes and edges in the path */ for (; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) { ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr); /* Same as previous should_ignore_reloc_root() call */ if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) { btrfs_put_root(root); list_add(&lower->list, &cache->useless_node); } else { lower->root = root; } break; } edge = btrfs_backref_alloc_edge(cache); if (!edge) { btrfs_put_root(root); ret = -ENOMEM; goto out; } eb = path->nodes[level]; rb_node = rb_simple_search(&cache->rb_root, eb->start); if (!rb_node) { upper = btrfs_backref_alloc_node(cache, eb->start, lower->level + 1); if (!upper) { btrfs_put_root(root); btrfs_backref_free_edge(cache, edge); ret = -ENOMEM; goto out; } upper->owner = btrfs_header_owner(eb); if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) upper->cowonly = 1; /* * If we know the block isn't shared we can avoid * checking its backrefs. */ if (btrfs_block_can_be_shared(trans, root, eb)) upper->checked = 0; else upper->checked = 1; /* * Add the block to pending list if we need to check its * backrefs, we only do this once while walking up a * tree as we will catch anything else later on. */ if (!upper->checked && need_check) { need_check = false; list_add_tail(&edge->list[UPPER], &cache->pending_edge); } else { if (upper->checked) need_check = true; INIT_LIST_HEAD(&edge->list[UPPER]); } } else { upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node); ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); if (!upper->owner) upper->owner = btrfs_header_owner(eb); } btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER); if (rb_node) { btrfs_put_root(root); break; } lower = upper; upper = NULL; } out: btrfs_release_path(path); return ret; } /* * Add backref node @cur into @cache. * * NOTE: Even if the function returned 0, @cur is not yet cached as its upper * links aren't yet bi-directional. Needs to finish such links. * Use btrfs_backref_finish_upper_links() to finish such linkage. * * @trans: Transaction handle. * @path: Released path for indirect tree backref lookup * @iter: Released backref iter for extent tree search * @node_key: The first key of the tree block */ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, struct btrfs_backref_node *cur) { struct btrfs_backref_edge *edge; struct btrfs_backref_node *exist; int ret; ret = btrfs_backref_iter_start(iter, cur->bytenr); if (ret < 0) return ret; /* * We skip the first btrfs_tree_block_info, as we don't use the key * stored in it, but fetch it from the tree block */ if (btrfs_backref_has_tree_block_info(iter)) { ret = btrfs_backref_iter_next(iter); if (ret < 0) goto out; /* No extra backref? This means the tree block is corrupted */ if (ret > 0) { ret = -EUCLEAN; goto out; } } WARN_ON(cur->checked); if (!list_empty(&cur->upper)) { /* * The backref was added previously when processing backref of * type BTRFS_TREE_BLOCK_REF_KEY */ ASSERT(list_is_singular(&cur->upper)); edge = list_entry(cur->upper.next, struct btrfs_backref_edge, list[LOWER]); ASSERT(list_empty(&edge->list[UPPER])); exist = edge->node[UPPER]; /* * Add the upper level block to pending list if we need check * its backrefs */ if (!exist->checked) list_add_tail(&edge->list[UPPER], &cache->pending_edge); } else { exist = NULL; } for (; ret == 0; ret = btrfs_backref_iter_next(iter)) { struct extent_buffer *eb; struct btrfs_key key; int type; cond_resched(); eb = iter->path->nodes[0]; key.objectid = iter->bytenr; if (btrfs_backref_iter_is_inline_ref(iter)) { struct btrfs_extent_inline_ref *iref; /* Update key for inline backref */ iref = (struct btrfs_extent_inline_ref *) ((unsigned long)iter->cur_ptr); type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_BLOCK); if (type == BTRFS_REF_TYPE_INVALID) { ret = -EUCLEAN; goto out; } key.type = type; key.offset = btrfs_extent_inline_ref_offset(eb, iref); } else { key.type = iter->cur_key.type; key.offset = iter->cur_key.offset; } /* * Parent node found and matches current inline ref, no need to * rebuild this node for this inline ref */ if (exist && ((key.type == BTRFS_TREE_BLOCK_REF_KEY && exist->owner == key.offset) || (key.type == BTRFS_SHARED_BLOCK_REF_KEY && exist->bytenr == key.offset))) { exist = NULL; continue; } /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { ret = handle_direct_tree_backref(cache, &key, cur); if (ret < 0) goto out; } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) { /* * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref * offset means the root objectid. We need to search * the tree to get its parent bytenr. */ ret = handle_indirect_tree_backref(trans, cache, path, &key, node_key, cur); if (ret < 0) goto out; } /* * Unrecognized tree backref items (if it can pass tree-checker) * would be ignored. */ } ret = 0; cur->checked = 1; WARN_ON(exist); out: btrfs_backref_iter_release(iter); return ret; } /* * Finish the upwards linkage created by btrfs_backref_add_tree_node() */ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, struct btrfs_backref_node *start) { struct list_head *useless_node = &cache->useless_node; struct btrfs_backref_edge *edge; struct rb_node *rb_node; LIST_HEAD(pending_edge); ASSERT(start->checked); /* Insert this node to cache if it's not COW-only */ if (!start->cowonly) { rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); if (rb_node) btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); list_add_tail(&start->lower, &cache->leaves); } /* * Use breadth first search to iterate all related edges. * * The starting points are all the edges of this node */ list_for_each_entry(edge, &start->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &pending_edge); while (!list_empty(&pending_edge)) { struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; edge = list_first_entry(&pending_edge, struct btrfs_backref_edge, list[UPPER]); list_del_init(&edge->list[UPPER]); upper = edge->node[UPPER]; lower = edge->node[LOWER]; /* Parent is detached, no need to keep any edges */ if (upper->detached) { list_del(&edge->list[LOWER]); btrfs_backref_free_edge(cache, edge); /* Lower node is orphan, queue for cleanup */ if (list_empty(&lower->upper)) list_add(&lower->list, useless_node); continue; } /* * All new nodes added in current build_backref_tree() haven't * been linked to the cache rb tree. * So if we have upper->rb_node populated, this means a cache * hit. We only need to link the edge, as @upper and all its * parents have already been linked. */ if (!RB_EMPTY_NODE(&upper->rb_node)) { if (upper->lowest) { list_del_init(&upper->lower); upper->lowest = 0; } list_add_tail(&edge->list[UPPER], &upper->lower); continue; } /* Sanity check, we shouldn't have any unchecked nodes */ if (!upper->checked) { ASSERT(0); return -EUCLEAN; } /* Sanity check, COW-only node has non-COW-only parent */ if (start->cowonly != upper->cowonly) { ASSERT(0); return -EUCLEAN; } /* Only cache non-COW-only (subvolume trees) tree blocks */ if (!upper->cowonly) { rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, &upper->rb_node); if (rb_node) { btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; } } list_add_tail(&edge->list[UPPER], &upper->lower); /* * Also queue all the parent edges of this uncached node * to finish the upper linkage */ list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &pending_edge); } return 0; } void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { struct btrfs_backref_node *lower; struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; while (!list_empty(&cache->useless_node)) { lower = list_first_entry(&cache->useless_node, struct btrfs_backref_node, list); list_del_init(&lower->list); } while (!list_empty(&cache->pending_edge)) { edge = list_first_entry(&cache->pending_edge, struct btrfs_backref_edge, list[UPPER]); list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); lower = edge->node[LOWER]; upper = edge->node[UPPER]; btrfs_backref_free_edge(cache, edge); /* * Lower is no longer linked to any upper backref nodes and * isn't in the cache, we can free it ourselves. */ if (list_empty(&lower->upper) && RB_EMPTY_NODE(&lower->rb_node)) list_add(&lower->list, &cache->useless_node); if (!RB_EMPTY_NODE(&upper->rb_node)) continue; /* Add this guy's upper edges to the list to process */ list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &cache->pending_edge); if (list_empty(&upper->upper)) list_add(&upper->list, &cache->useless_node); } while (!list_empty(&cache->useless_node)) { lower = list_first_entry(&cache->useless_node, struct btrfs_backref_node, list); list_del_init(&lower->list); if (lower == node) node = NULL; btrfs_backref_drop_node(cache, lower); } btrfs_backref_cleanup_node(cache, node); ASSERT(list_empty(&cache->useless_node) && list_empty(&cache->pending_edge)); } |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | // SPDX-License-Identifier: GPL-2.0 /* * Simple file system for zoned block devices exposing zones as files. * * Copyright (C) 2022 Western Digital Corporation or its affiliates. */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include "zonefs.h" struct zonefs_sysfs_attr { struct attribute attr; ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf); }; #define ZONEFS_SYSFS_ATTR_RO(name) \ static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name) #define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct zonefs_sb_info *sbi = container_of(kobj, struct zonefs_sb_info, s_kobj); struct zonefs_sysfs_attr *zonefs_attr = container_of(attr, struct zonefs_sysfs_attr, attr); if (!zonefs_attr->show) return 0; return zonefs_attr->show(sbi, buf); } static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files); } ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files); static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files)); } ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files); static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files); } ZONEFS_SYSFS_ATTR_RO(max_active_seq_files); static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files)); } ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files); static struct attribute *zonefs_sysfs_attrs[] = { ATTR_LIST(max_wro_seq_files), ATTR_LIST(nr_wro_seq_files), ATTR_LIST(max_active_seq_files), ATTR_LIST(nr_active_seq_files), NULL, }; ATTRIBUTE_GROUPS(zonefs_sysfs); static void zonefs_sysfs_sb_release(struct kobject *kobj) { struct zonefs_sb_info *sbi = container_of(kobj, struct zonefs_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static const struct sysfs_ops zonefs_sysfs_attr_ops = { .show = zonefs_sysfs_attr_show, }; static const struct kobj_type zonefs_sb_ktype = { .default_groups = zonefs_sysfs_groups, .sysfs_ops = &zonefs_sysfs_attr_ops, .release = zonefs_sysfs_sb_release, }; static struct kobject *zonefs_sysfs_root; int zonefs_sysfs_register(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); int ret; super_set_sysfs_name_id(sb); init_completion(&sbi->s_kobj_unregister); ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype, zonefs_sysfs_root, "%s", sb->s_id); if (ret) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); return ret; } sbi->s_sysfs_registered = true; return 0; } void zonefs_sysfs_unregister(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); if (!sbi || !sbi->s_sysfs_registered) return; kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } int __init zonefs_sysfs_init(void) { zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj); if (!zonefs_sysfs_root) return -ENOMEM; return 0; } void zonefs_sysfs_exit(void) { kobject_put(zonefs_sysfs_root); zonefs_sysfs_root = NULL; } |
| 117 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | // SPDX-License-Identifier: GPL-2.0-or-later /* * xpress_decompress.c - A decompressor for the XPRESS compression format * (Huffman variant), which can be used in "System Compressed" files. This is * based on the code from wimlib. * * Copyright (C) 2015 Eric Biggers */ #include "decompress_common.h" #include "lib.h" #define XPRESS_NUM_SYMBOLS 512 #define XPRESS_MAX_CODEWORD_LEN 15 #define XPRESS_MIN_MATCH_LEN 3 /* This value is chosen for fast decompression. */ #define XPRESS_TABLEBITS 12 /* Reusable heap-allocated memory for XPRESS decompression */ struct xpress_decompressor { /* The Huffman decoding table */ u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; /* An array that maps symbols to codeword lengths */ u8 lens[XPRESS_NUM_SYMBOLS]; /* Temporary space for make_huffman_decode_table() */ u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) + XPRESS_NUM_SYMBOLS]; }; /* * xpress_allocate_decompressor - Allocate an XPRESS decompressor * * Return the pointer to the decompressor on success, or return NULL and set * errno on failure. */ struct xpress_decompressor *xpress_allocate_decompressor(void) { return kmalloc(sizeof(struct xpress_decompressor), GFP_NOFS); } /* * xpress_decompress - Decompress a buffer of XPRESS-compressed data * * @decompressor: A decompressor that was allocated with * xpress_allocate_decompressor() * @compressed_data: The buffer of data to decompress * @compressed_size: Number of bytes of compressed data * @uncompressed_data: The buffer in which to store the decompressed data * @uncompressed_size: The number of bytes the data decompresses into * * Return 0 on success, or return -1 and set errno on failure. */ int xpress_decompress(struct xpress_decompressor *decompressor, const void *compressed_data, size_t compressed_size, void *uncompressed_data, size_t uncompressed_size) { struct xpress_decompressor *d = decompressor; const u8 * const in_begin = compressed_data; u8 * const out_begin = uncompressed_data; u8 *out_next = out_begin; u8 * const out_end = out_begin + uncompressed_size; struct input_bitstream is; u32 i; /* Read the Huffman codeword lengths. */ if (compressed_size < XPRESS_NUM_SYMBOLS / 2) goto invalid; for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { d->lens[i*2 + 0] = in_begin[i] & 0xF; d->lens[i*2 + 1] = in_begin[i] >> 4; } /* Build a decoding table for the Huffman code. */ if (make_huffman_decode_table(d->decode_table, XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS, d->lens, XPRESS_MAX_CODEWORD_LEN, d->working_space)) goto invalid; /* Decode the matches and literals. */ init_input_bitstream(&is, in_begin + XPRESS_NUM_SYMBOLS / 2, compressed_size - XPRESS_NUM_SYMBOLS / 2); while (out_next != out_end) { u32 sym; u32 log2_offset; u32 length; u32 offset; sym = read_huffsym(&is, d->decode_table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); if (sym < 256) { /* Literal */ *out_next++ = sym; } else { /* Match */ length = sym & 0xf; log2_offset = (sym >> 4) & 0xf; bitstream_ensure_bits(&is, 16); offset = ((u32)1 << log2_offset) | bitstream_pop_bits(&is, log2_offset); if (length == 0xf) { length += bitstream_read_byte(&is); if (length == 0xf + 0xff) length = bitstream_read_u16(&is); } length += XPRESS_MIN_MATCH_LEN; if (offset > (size_t)(out_next - out_begin)) goto invalid; if (length > (size_t)(out_end - out_next)) goto invalid; out_next = lz_copy(out_next, length, offset, out_end, XPRESS_MIN_MATCH_LEN); } } return 0; invalid: return -1; } /* * xpress_free_decompressor - Free an XPRESS decompressor * * @decompressor: A decompressor that was allocated with * xpress_allocate_decompressor(), or NULL. */ void xpress_free_decompressor(struct xpress_decompressor *decompressor) { kfree(decompressor); } |
| 3 3 3 3 3 3 3 1 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | /* * This file implement the Wireless Extensions proc API. * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. * * (As all part of the Linux kernel, this file is GPL) */ /* * The /proc/net/wireless file is a human readable user-space interface * exporting various wireless specific statistics from the wireless devices. * This is the most popular part of the Wireless Extensions ;-) * * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). * The content of the file is basically the content of "struct iw_statistics". */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/wireless.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/iw_handler.h> #include <net/wext.h> static void wireless_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { /* Get stats from the driver */ struct iw_statistics *stats = get_wireless_stats(dev); static struct iw_statistics nullstats = {}; /* show device if it's wireless regardless of current stats */ if (!stats) { #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) stats = &nullstats; #endif #ifdef CONFIG_CFG80211 if (dev->ieee80211_ptr) stats = &nullstats; #endif } if (stats) { seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d " "%6d %6d %6d\n", dev->name, stats->status, stats->qual.qual, stats->qual.updated & IW_QUAL_QUAL_UPDATED ? '.' : ' ', ((__s32) stats->qual.level) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_LEVEL_UPDATED ? '.' : ' ', ((__s32) stats->qual.noise) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_NOISE_UPDATED ? '.' : ' ', stats->discard.nwid, stats->discard.code, stats->discard.fragment, stats->discard.retries, stats->discard.misc, stats->miss.beacon); if (stats != &nullstats) stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; } } /* ---------------------------------------------------------------- */ /* * Print info for /proc/net/wireless (print all entries) */ static int wireless_dev_seq_show(struct seq_file *seq, void *v) { might_sleep(); if (v == SEQ_START_TOKEN) seq_printf(seq, "Inter-| sta-| Quality | Discarded " "packets | Missed | WE\n" " face | tus | link level noise | nwid " "crypt frag retry misc | beacon | %d\n", WIRELESS_EXT); else wireless_seq_printf_stats(seq, v); return 0; } static void *wireless_dev_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; rtnl_lock(); if (!*pos) return SEQ_START_TOKEN; off = 1; for_each_netdev(net, dev) if (off++ == *pos) return dev; return NULL; } static void *wireless_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); ++*pos; return v == SEQ_START_TOKEN ? first_net_device(net) : next_net_device(v); } static void wireless_dev_seq_stop(struct seq_file *seq, void *v) { rtnl_unlock(); } static const struct seq_operations wireless_seq_ops = { .start = wireless_dev_seq_start, .next = wireless_dev_seq_next, .stop = wireless_dev_seq_stop, .show = wireless_dev_seq_show, }; int __net_init wext_proc_init(struct net *net) { /* Create /proc/net/wireless entry */ if (!proc_create_net("wireless", 0444, net->proc_net, &wireless_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } void __net_exit wext_proc_exit(struct net *net) { remove_proc_entry("wireless", net->proc_net); } |
| 69 69 69 54 24 69 69 1 1 1 1 1 1 2 2 2 2 2 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | // SPDX-License-Identifier: GPL-2.0 /* * Block stat tracking code * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/rculist.h> #include "blk-stat.h" #include "blk-mq.h" #include "blk.h" struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; stat->batch = 0; } /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, dst->nr_samples + src->nr_samples); dst->nr_samples += src->nr_samples; } void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); stat->batch += value; stat->nr_samples++; } void blk_stat_add(struct request *rq, u64 now) { struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; bucket = cb->bucket_fn(rq); if (bucket < 0) continue; stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); } put_cpu(); rcu_read_unlock(); } static void blk_stat_timer_fn(struct timer_list *t) { struct blk_stat_callback *cb = from_timer(cb, t, timer); unsigned int bucket; int cpu; for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cb->stat[bucket]); for_each_online_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) { blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); blk_rq_stat_init(&cpu_stat[bucket]); } } cb->timer_fn(cb); } struct blk_stat_callback * blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), int (*bucket_fn)(const struct request *), unsigned int buckets, void *data) { struct blk_stat_callback *cb; cb = kmalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return NULL; cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), GFP_KERNEL); if (!cb->stat) { kfree(cb); return NULL; } cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), __alignof__(struct blk_rq_stat)); if (!cb->cpu_stat) { kfree(cb->stat); kfree(cb); return NULL; } cb->timer_fn = timer_fn; cb->bucket_fn = bucket_fn; cb->data = data; cb->buckets = buckets; timer_setup(&cb->timer, blk_stat_timer_fn, 0); return cb; } void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned int bucket; unsigned long flags; int cpu; for_each_possible_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cpu_stat[bucket]); } spin_lock_irqsave(&q->stats->lock, flags); list_add_tail_rcu(&cb->list, &q->stats->callbacks); blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); del_timer_sync(&cb->timer); } static void blk_stat_free_callback_rcu(struct rcu_head *head) { struct blk_stat_callback *cb; cb = container_of(head, struct blk_stat_callback, rcu); free_percpu(cb->cpu_stat); kfree(cb->stat); kfree(cb); } void blk_stat_free_callback(struct blk_stat_callback *cb) { if (cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } void blk_stat_disable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!--q->stats->accounting && list_empty(&q->stats->callbacks)) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!q->stats->accounting++ && list_empty(&q->stats->callbacks)) blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); struct blk_queue_stats *blk_alloc_queue_stats(void) { struct blk_queue_stats *stats; stats = kmalloc(sizeof(*stats), GFP_KERNEL); if (!stats) return NULL; INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); stats->accounting = 0; return stats; } void blk_free_queue_stats(struct blk_queue_stats *stats) { if (!stats) return; WARN_ON(!list_empty(&stats->callbacks)); kfree(stats); } |
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 | // SPDX-License-Identifier: GPL-2.0-or-later /* * User-space Probes (UProbes) for x86 * * Copyright (C) IBM Corporation, 2008-2011 * Authors: * Srikar Dronamraju * Jim Keniston */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/kdebug.h> #include <asm/processor.h> #include <asm/insn.h> #include <asm/mmu_context.h> /* Post-execution fixups. */ /* Adjust IP back to vicinity of actual insn */ #define UPROBE_FIX_IP 0x01 /* Adjust the return address of a call insn */ #define UPROBE_FIX_CALL 0x02 /* Instruction will modify TF, don't change it */ #define UPROBE_FIX_SETF 0x04 #define UPROBE_FIX_RIP_SI 0x08 #define UPROBE_FIX_RIP_DI 0x10 #define UPROBE_FIX_RIP_BX 0x20 #define UPROBE_FIX_RIP_MASK \ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) #define UPROBE_TRAP_NR UINT_MAX /* Adaptations for mhiramat x86 decoder v14. */ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) #define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ << (row % 32)) /* * Good-instruction tables for 32-bit apps. This is non-const and volatile * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. * * Opcodes we'll probably never support: * 6c-6f - ins,outs. SEGVs if used in userspace * e4-e7 - in,out imm. SEGVs if used in userspace * ec-ef - in,out acc. SEGVs if used in userspace * cc - int3. SIGTRAP if used in userspace * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs * (why we support bound (62) then? it's similar, and similarly unused...) * f1 - int1. SIGTRAP if used in userspace * f4 - hlt. SEGVs if used in userspace * fa - cli. SEGVs if used in userspace * fb - sti. SEGVs if used in userspace * * Opcodes which need some work to be supported: * 07,17,1f - pop es/ss/ds * Normally not used in userspace, but would execute if used. * Can cause GP or stack exception if tries to load wrong segment descriptor. * We hesitate to run them under single step since kernel's handling * of userspace single-stepping (TF flag) is fragile. * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e) * on the same grounds that they are never used. * cd - int N. * Used by userspace for "int 80" syscall entry. (Other "int N" * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). * Not supported since kernel's handling of userspace single-stepping * (TF flag) is fragile. * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #else #define good_insns_32 NULL #endif /* Good-instruction tables for 64-bit apps. * * Genuinely invalid opcodes: * 06,07 - formerly push/pop es * 0e - formerly push cs * 16,17 - formerly push/pop ss * 1e,1f - formerly push/pop ds * 27,2f,37,3f - formerly daa/das/aaa/aas * 60,61 - formerly pusha/popa * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported) * 82 - formerly redundant encoding of Group1 * 9a - formerly call seg:ofs * ce - formerly into * d4,d5 - formerly aam/aad * d6 - formerly undocumented salc * ea - formerly jmp seg:ofs * * Opcodes we'll probably never support: * 6c-6f - ins,outs. SEGVs if used in userspace * e4-e7 - in,out imm. SEGVs if used in userspace * ec-ef - in,out acc. SEGVs if used in userspace * cc - int3. SIGTRAP if used in userspace * f1 - int1. SIGTRAP if used in userspace * f4 - hlt. SEGVs if used in userspace * fa - cli. SEGVs if used in userspace * fb - sti. SEGVs if used in userspace * * Opcodes which need some work to be supported: * cd - int N. * Used by userspace for "int 80" syscall entry. (Other "int N" * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). * Not supported since kernel's handling of userspace single-stepping * (TF flag) is fragile. * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad */ #if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */ W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #else #define good_insns_64 NULL #endif /* Using this for both 64-bit and 32-bit apps. * Opcodes we don't support: * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. * Also encodes tons of other system insns if mod=11. * Some are in fact non-system: xend, xtest, rdtscp, maybe more * 0f 05 - syscall * 0f 06 - clts (CPL0 insn) * 0f 07 - sysret * 0f 08 - invd (CPL0 insn) * 0f 09 - wbinvd (CPL0 insn) * 0f 0b - ud2 * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?) * 0f 34 - sysenter * 0f 35 - sysexit * 0f 37 - getsec * 0f 78 - vmread (Intel VMX. CPL0 insn) * 0f 79 - vmwrite (Intel VMX. CPL0 insn) * Note: with prefixes, these two opcodes are * extrq/insertq/AVX512 convert vector ops. * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt], * {rd,wr}{fs,gs}base,{s,l,m}fence. * Why? They are all user-executable. */ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #undef W /* * opcodes we may need to refine support for: * * 0f - 2-byte instructions: For many of these instructions, the validity * depends on the prefix and/or the reg field. On such instructions, we * just consider the opcode combination valid if it corresponds to any * valid instruction. * * 8f - Group 1 - only reg = 0 is OK * c6-c7 - Group 11 - only reg = 0 is OK * d9-df - fpu insns with some illegal encodings * f2, f3 - repnz, repz prefixes. These are also the first byte for * certain floating-point instructions, such as addsd. * * fe - Group 4 - only reg = 0 or 1 is OK * ff - Group 5 - only reg = 0-6 is OK * * others -- Do we need to support these? * * 0f - (floating-point?) prefetch instructions * 07, 17, 1f - pop es, pop ss, pop ds * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- * but 64 and 65 (fs: and gs:) seem to be used, so we support them * 67 - addr16 prefix * ce - into * f0 - lock prefix */ /* * TODO: * - Where necessary, examine the modrm byte and allow only valid instructions * in the different Groups and fpu instructions. */ static bool is_prefix_bad(struct insn *insn) { insn_byte_t p; int i; for_each_insn_prefix(insn, i, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); switch (attr) { case INAT_MAKE_PREFIX(INAT_PFX_ES): case INAT_MAKE_PREFIX(INAT_PFX_CS): case INAT_MAKE_PREFIX(INAT_PFX_DS): case INAT_MAKE_PREFIX(INAT_PFX_SS): case INAT_MAKE_PREFIX(INAT_PFX_LOCK): return true; } } return false; } static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) { enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32; u32 volatile *good_insns; int ret; ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m); if (ret < 0) return -ENOEXEC; if (is_prefix_bad(insn)) return -ENOTSUPP; /* We should not singlestep on the exception masking instructions */ if (insn_masking_exception(insn)) return -ENOTSUPP; if (x86_64) good_insns = good_insns_64; else good_insns = good_insns_32; if (test_bit(OPCODE1(insn), (unsigned long *)good_insns)) return 0; if (insn->opcode.nbytes == 2) { if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) return 0; } return -ENOTSUPP; } #ifdef CONFIG_X86_64 asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" "pushq %rax\n" "pushq %rcx\n" "pushq %r11\n" "movq $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" "popq %r11\n" "popq %rcx\n" /* The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ "retq\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" ); extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; void *arch_uprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); /* * At the moment the uretprobe syscall trampoline is supported * only for native 64-bit process, the compat process still uses * standard breakpoint. */ if (user_64bit_mode(regs)) { *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry; return uretprobe_trampoline_entry; } *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static unsigned long trampoline_check_ip(void) { unsigned long tramp = uprobe_get_trampoline_vaddr(); return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry); } SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); unsigned long err, ip, sp, r11_cx_ax[3]; if (regs->ip != trampoline_check_ip()) goto sigill; err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ regs->r11 = r11_cx_ax[0]; regs->cx = r11_cx_ax[1]; regs->ax = r11_cx_ax[2]; regs->sp += sizeof(r11_cx_ax); regs->orig_ax = -1; ip = regs->ip; sp = regs->sp; uprobe_handle_trampoline(regs); /* * Some of the uprobe consumers has changed sp, we can do nothing, * just return via iret. * .. or shadow stack is enabled, in which case we need to skip * return through the user space stack address. */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; regs->sp -= sizeof(r11_cx_ax); /* for the case uprobe_consumer has changed r11/cx */ r11_cx_ax[0] = regs->r11; r11_cx_ax[1] = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ r11_cx_ax[2] = regs->ip; regs->ip = ip; err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); if (err) goto sigill; /* ensure sysret, see do_syscall_64() */ regs->r11 = regs->flags; regs->cx = regs->ip; return regs->ax; sigill: force_sig(SIGILL); return -1; } /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses * its memory operand indirectly through a scratch register. Set * defparam->fixups accordingly. (The contents of the scratch register * will be saved before we single-step the modified instruction, * and restored afterward). * * We do this because a rip-relative instruction can access only a * relatively small area (+/- 2 GB from the instruction), and the XOL * area typically lies beyond that area. At least for instructions * that store to memory, we can't execute the original instruction * and "fix things up" later, because the misdirected store could be * disastrous. * * Some useful facts about rip-relative instructions: * * - There's always a modrm byte with bit layout "00 reg 101". * - There's never a SIB byte. * - The displacement is always 4 bytes. * - REX.B=1 bit in REX prefix, which normally extends r/m field, * has no effect on rip-relative mode. It doesn't make modrm byte * with r/m=101 refer to register 1101 = R13. */ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; u8 reg2; if (!insn_rip_relative(insn)) return; /* * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm. * Clear REX.b bit (extension of MODRM.rm field): * we want to encode low numbered reg, not r8+. */ if (insn->rex_prefix.nbytes) { cursor = auprobe->insn + insn_offset_rex_prefix(insn); /* REX byte has 0100wrxb layout, clearing REX.b bit */ *cursor &= 0xfe; } /* * Similar treatment for VEX3/EVEX prefix. * TODO: add XOP treatment when insn decoder supports them */ if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa * Setting VEX3.b (setting because it has inverted meaning). * Setting EVEX.x since (in non-SIB encoding) EVEX.x * is the 4th bit of MODRM.rm, and needs the same treatment. * For VEX3-encoded insns, VEX3.x value has no effect in * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; *cursor |= 0x60; } /* * Convert from rip-relative addressing to register-relative addressing * via a scratch register. * * This is tricky since there are insns with modrm byte * which also use registers not encoded in modrm byte: * [i]div/[i]mul: implicitly use dx:ax * shift ops: implicitly use cx * cmpxchg: implicitly uses ax * cmpxchg8/16b: implicitly uses dx:ax and bx:cx * Encoding: 0f c7/1 modrm * The code below thinks that reg=1 (cx), chooses si as scratch. * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. * First appeared in Haswell (BMI2 insn). It is vex-encoded. * Example where none of bx,cx,dx can be used as scratch reg: * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx * [v]pcmpistri: implicitly uses cx, xmm0 * [v]pcmpistrm: implicitly uses xmm0 * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 * [v]pcmpestrm: implicitly uses ax, dx, xmm0 * Evil SSE4.2 string comparison ops from hell. * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). * AMD says it has no 3-operand form (vex.vvvv must be 1111) * and that it can have only register operands, not mem * (its modrm byte must have mode=11). * If these restrictions will ever be lifted, * we'll need code to prevent selection of di as scratch reg! * * Summary: I don't know any insns with modrm byte which * use SI register implicitly. DI register is used only * by one insn (maskmovq) and BX register is used * only by one too (cmpxchg8b). * BP is stack-segment based (may be a problem?). * AX, DX, CX are off-limits (many implicit users). * SP is unusable (it's stack pointer - think about "pop mem"; * also, rsp+disp32 needs sib encoding -> insn length change). */ reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. * Therefore, let's consider only 3 low-order bits. */ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; /* * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. * * Choose scratch reg. Order is important: must not select bx * if we can use si (cmpxchg8b case!) */ if (reg != 6 && reg2 != 6) { reg2 = 6; auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI; } else if (reg != 7 && reg2 != 7) { reg2 = 7; auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI; /* TODO (paranoia): force maskmovq to not use di */ } else { reg2 = 3; auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX; } /* * Point cursor at the modrm byte. The next 4 bytes are the * displacement. Beyond the displacement, for some instructions, * is the immediate operand. */ cursor = auprobe->insn + insn_offset_modrm(insn); /* * Change modrm from "00 reg 101" to "10 reg reg2". Example: * 89 05 disp32 mov %eax,disp32(%rip) becomes * 89 86 disp32 mov %eax,disp32(%rsi) */ *cursor = 0x80 | (reg << 3) | reg2; } static inline unsigned long * scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI) return ®s->si; if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI) return ®s->di; return ®s->bx; } /* * If we're emulating a rip-relative instruction, save the contents * of the scratch register and store the target address in that register. */ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); utask->autask.saved_scratch_register = *sr; *sr = utask->vaddr + auprobe->defparam.ilen; } } static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { struct uprobe_task *utask = current->utask; unsigned long *sr = scratch_reg(auprobe, regs); *sr = utask->autask.saved_scratch_register; } } #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit */ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { } static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { bool (*emulate)(struct arch_uprobe *, struct pt_regs *); int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); int (*post_xol)(struct arch_uprobe *, struct pt_regs *); void (*abort)(struct arch_uprobe *, struct pt_regs *); }; static inline int sizeof_long(struct pt_regs *regs) { /* * Check registers for mode as in_xxx_syscall() does not apply here. */ return user_64bit_mode(regs) ? 8 : 4; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { riprel_pre_xol(auprobe, regs); return 0; } static int emulate_push_stack(struct pt_regs *regs, unsigned long val) { unsigned long new_sp = regs->sp - sizeof_long(regs); if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs))) return -EFAULT; regs->sp = new_sp; return 0; } /* * We have to fix things up as follows: * * Typically, the new ip is relative to the copied instruction. We need * to make it relative to the original instruction (FIX_IP). Exceptions * are return instructions and absolute or indirect jump or call instructions. * * If the single-stepped instruction was a call, the return address that * is atop the stack is the address following the copied instruction. We * need to make it the address following the original instruction (FIX_CALL). * * If the original instruction was a rip-relative instruction such as * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". * We need to restore the contents of the scratch register * (FIX_RIP_reg). */ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; riprel_post_xol(auprobe, regs); if (auprobe->defparam.fixups & UPROBE_FIX_IP) { long correction = utask->vaddr - utask->xol_vaddr; regs->ip += correction; } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { regs->sp += sizeof_long(regs); /* Pop incorrect return address */ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen)) return -ERESTART; } /* popf; tell the caller to not touch TF */ if (auprobe->defparam.fixups & UPROBE_FIX_SETF) utask->autask.saved_tf = true; return 0; } static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { riprel_post_xol(auprobe, regs); } static const struct uprobe_xol_ops default_xol_ops = { .pre_xol = default_pre_xol_op, .post_xol = default_post_xol_op, .abort = default_abort_op, }; static bool branch_is_call(struct arch_uprobe *auprobe) { return auprobe->branch.opc1 == 0xe8; } #define CASE_COND \ COND(70, 71, XF(OF)) \ COND(72, 73, XF(CF)) \ COND(74, 75, XF(ZF)) \ COND(78, 79, XF(SF)) \ COND(7a, 7b, XF(PF)) \ COND(76, 77, XF(CF) || XF(ZF)) \ COND(7c, 7d, XF(SF) != XF(OF)) \ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) #define COND(op_y, op_n, expr) \ case 0x ## op_y: DO((expr) != 0) \ case 0x ## op_n: DO((expr) == 0) #define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) static bool is_cond_jmp_opcode(u8 opcode) { switch (opcode) { #define DO(expr) \ return true; CASE_COND #undef DO default: return false; } } static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long flags = regs->flags; switch (auprobe->branch.opc1) { #define DO(expr) \ return expr; CASE_COND #undef DO default: /* not a conditional jmp */ return true; } } #undef XF #undef COND #undef CASE_COND static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long new_ip = regs->ip += auprobe->branch.ilen; unsigned long offs = (long)auprobe->branch.offs; if (branch_is_call(auprobe)) { /* * If it fails we execute this (mangled, see the comment in * branch_clear_offset) insn out-of-line. In the likely case * this should trigger the trap, and the probed application * should die or restart the same insn after it handles the * signal, arch_uprobe_post_xol() won't be even called. * * But there is corner case, see the comment in ->post_xol(). */ if (emulate_push_stack(regs, new_ip)) return false; } else if (!check_jmp_cond(auprobe, regs)) { offs = 0; } regs->ip = new_ip + offs; return true; } static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset; if (emulate_push_stack(regs, *src_ptr)) return false; regs->ip += auprobe->push.ilen; return true; } static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { BUG_ON(!branch_is_call(auprobe)); /* * We can only get here if branch_emulate_op() failed to push the ret * address _and_ another thread expanded our stack before the (mangled) * "call" insn was executed out-of-line. Just restore ->sp and restart. * We could also restore ->ip and try to call branch_emulate_op() again. */ regs->sp += sizeof_long(regs); return -ERESTART; } static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) { /* * Turn this insn into "call 1f; 1:", this is what we will execute * out-of-line if ->emulate() fails. We only need this to generate * a trap, so that the probed task receives the correct signal with * the properly filled siginfo. * * But see the comment in ->post_xol(), in the unlikely case it can * succeed. So we need to ensure that the new ->ip can not fall into * the non-canonical area and trigger #GP. * * We could turn it into (say) "pushf", but then we would need to * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte * of ->insn[] for set_orig_insn(). */ memset(auprobe->insn + insn_offset_immediate(insn), 0, insn->immediate.nbytes); } static const struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, .post_xol = branch_post_xol_op, }; static const struct uprobe_xol_ops push_xol_ops = { .emulate = push_emulate_op, }; /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); insn_byte_t p; int i; switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ break; case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); break; case 0x0f: if (insn->opcode.nbytes != 2) return -ENOSYS; /* * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches * OPCODE1() of the "short" jmp which checks the same condition. */ opc1 = OPCODE2(insn) - 0x10; fallthrough; default: if (!is_cond_jmp_opcode(opc1)) return -ENOSYS; } /* * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. */ for_each_insn_prefix(insn, i, p) { if (p == 0x66) return -ENOTSUPP; } setup: auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; auprobe->ops = &branch_xol_ops; return 0; } /* Returns -ENOSYS if push_xol_ops doesn't handle this insn */ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn), reg_offset = 0; if (opc1 < 0x50 || opc1 > 0x57) return -ENOSYS; if (insn->length > 2) return -ENOSYS; if (insn->length == 2) { /* only support rex_prefix 0x41 (x64 only) */ #ifdef CONFIG_X86_64 if (insn->rex_prefix.nbytes != 1 || insn->rex_prefix.bytes[0] != 0x41) return -ENOSYS; switch (opc1) { case 0x50: reg_offset = offsetof(struct pt_regs, r8); break; case 0x51: reg_offset = offsetof(struct pt_regs, r9); break; case 0x52: reg_offset = offsetof(struct pt_regs, r10); break; case 0x53: reg_offset = offsetof(struct pt_regs, r11); break; case 0x54: reg_offset = offsetof(struct pt_regs, r12); break; case 0x55: reg_offset = offsetof(struct pt_regs, r13); break; case 0x56: reg_offset = offsetof(struct pt_regs, r14); break; case 0x57: reg_offset = offsetof(struct pt_regs, r15); break; } #else return -ENOSYS; #endif } else { switch (opc1) { case 0x50: reg_offset = offsetof(struct pt_regs, ax); break; case 0x51: reg_offset = offsetof(struct pt_regs, cx); break; case 0x52: reg_offset = offsetof(struct pt_regs, dx); break; case 0x53: reg_offset = offsetof(struct pt_regs, bx); break; case 0x54: reg_offset = offsetof(struct pt_regs, sp); break; case 0x55: reg_offset = offsetof(struct pt_regs, bp); break; case 0x56: reg_offset = offsetof(struct pt_regs, si); break; case 0x57: reg_offset = offsetof(struct pt_regs, di); break; } } auprobe->push.reg_offset = reg_offset; auprobe->push.ilen = insn->length; auprobe->ops = &push_xol_ops; return 0; } /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @auprobe: the probepoint information. * @mm: the probed address space. * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; ret = push_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; /* * Figure out which fixups default_post_xol_op() will need to perform, * and annotate defparam->fixups accordingly. */ switch (OPCODE1(&insn)) { case 0x9d: /* popf */ auprobe->defparam.fixups |= UPROBE_FIX_SETF; break; case 0xc3: /* ret or lret -- ip is correct */ case 0xcb: case 0xc2: case 0xca: case 0xea: /* jmp absolute -- ip is correct */ fix_ip_or_call = 0; break; case 0x9a: /* call absolute - Fix return addr, not ip */ fix_ip_or_call = UPROBE_FIX_CALL; break; case 0xff: switch (MODRM_REG(&insn)) { case 2: case 3: /* call or lcall, indirect */ fix_ip_or_call = UPROBE_FIX_CALL; break; case 4: case 5: /* jmp or ljmp, indirect */ fix_ip_or_call = 0; break; } fallthrough; default: riprel_analyze(auprobe, &insn); } auprobe->defparam.ilen = insn.length; auprobe->defparam.fixups |= fix_ip_or_call; auprobe->ops = &default_xol_ops; return 0; } /* * arch_uprobe_pre_xol - prepare to execute out of line. * @auprobe: the probepoint information. * @regs: reflects the saved user state of current task. */ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (auprobe->ops->pre_xol) { int err = auprobe->ops->pre_xol(auprobe, regs); if (err) return err; } regs->ip = utask->xol_vaddr; utask->autask.saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); regs->flags |= X86_EFLAGS_TF; if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); return 0; } /* * If xol insn itself traps and generates a signal(Say, * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped * instruction jumps back to its own address. It is assumed that anything * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. * * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol(). */ bool arch_uprobe_xol_was_trapped(struct task_struct *t) { if (t->thread.trap_nr != UPROBE_TRAP_NR) return true; return false; } /* * Called after single-stepping. To avoid the SMP problems that can * occur when we temporarily put back the original opcode to * single-step, we single-stepped a copy of the instruction. * * This function prepares to resume execution after the single-step. */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; bool send_sigtrap = utask->autask.saved_tf; int err = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); current->thread.trap_nr = utask->autask.saved_trap_nr; if (auprobe->ops->post_xol) { err = auprobe->ops->post_xol(auprobe, regs); if (err) { /* * Restore ->ip for restart or post mortem analysis. * ->post_xol() must not return -ERESTART unless this * is really possible. */ regs->ip = utask->vaddr; if (err == -ERESTART) err = 0; send_sigtrap = false; } } /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need * to examine the opcode to make it right. */ if (send_sigtrap) send_sig(SIGTRAP, current, 0); if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; return err; } /* callback routine for handling exceptions. */ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = data; struct pt_regs *regs = args->regs; int ret = NOTIFY_DONE; /* We are only interested in userspace traps */ if (regs && !user_mode(regs)) return NOTIFY_DONE; switch (val) { case DIE_INT3: if (uprobe_pre_sstep_notifier(regs)) ret = NOTIFY_STOP; break; case DIE_DEBUG: if (uprobe_post_sstep_notifier(regs)) ret = NOTIFY_STOP; break; default: break; } return ret; } /* * This function gets called when XOL instruction either gets trapped or * the thread has a fatal signal. Reset the instruction pointer to its * probed address for the potential restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (auprobe->ops->abort) auprobe->ops->abort(auprobe, regs); current->thread.trap_nr = utask->autask.saved_trap_nr; regs->ip = utask->vaddr; /* clear TF if it was set by us in arch_uprobe_pre_xol() */ if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; } static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (auprobe->ops->emulate) return auprobe->ops->emulate(auprobe, regs); return false; } bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { bool ret = __skip_sstep(auprobe, regs); if (ret && (regs->flags & X86_EFLAGS_TF)) send_sig(SIGTRAP, current, 0); return ret; } unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { int rasize = sizeof_long(regs), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); if (likely(!nleft)) { if (shstk_update_last_frame(trampoline_vaddr)) { force_sig(SIGSEGV); return -1; } return orig_ret_vaddr; } if (nleft != rasize) { pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", current->pid, regs->sp, regs->ip); force_sig(SIGSEGV); } return -1; } bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ return regs->sp < ret->stack; else return regs->sp <= ret->stack; } |
| 2 92 31 3 4 26 47 47 49 18 22 6 6 5 5 28 28 28 20 20 28 28 19 19 51 1 50 11 4 3 7 14 6 15 16 1 9 8 18 7 5 8 4 14 15 11 7 7 7 7 7 7 7 7 7 7 7 7 3 4 4 3 7 7 3 4 5 35 3 2 1 48 8 40 46 48 4 39 2 3 6 14 6 12 2 40 1 39 37 19 37 37 19 8 10 10 36 1 36 1 4 12 11 8 18 2 19 16 35 1 1 2 2 9 10 3 3 130 130 130 129 130 2 14 130 129 13 2 130 129 2 1 13 18 18 6 18 18 18 17 18 18 1 3 89 89 90 49 52 90 18 86 63 27 86 15 88 13 75 15 78 12 87 3 80 10 37 55 9 3 18 19 92 96 94 95 19 92 3 90 48 52 52 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/isofs/inode.c * * (C) 1991 Linus Torvalds - minix filesystem * 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. * 1994 Eberhard Mönkeberg - multi session handling. * 1995 Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs. * 1997 Gordon Chaffee - Joliet CDs * 1998 Eric Lammerts - ISO 9660 Level 3 * 2004 Paul Serice - Inode Support pushed out from 4GB to 128GB * 2004 Paul Serice - NFS Export Operations */ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/nls.h> #include <linux/ctype.h> #include <linux/statfs.h> #include <linux/cdrom.h> #include <linux/mpage.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "isofs.h" #include "zisofs.h" /* max tz offset is 13 hours */ #define MAX_TZ_OFFSET (52*15*60) #define BEQUIET static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif static void isofs_put_super(struct super_block *sb) { struct isofs_sb_info *sbi = ISOFS_SB(sb); #ifdef CONFIG_JOLIET unload_nls(sbi->s_nls_iocharset); #endif kfree(sbi); sb->s_fs_info = NULL; return; } static int isofs_read_inode(struct inode *, int relocated); static int isofs_statfs (struct dentry *, struct kstatfs *); static int isofs_show_options(struct seq_file *, struct dentry *); static struct kmem_cache *isofs_inode_cachep; static struct inode *isofs_alloc_inode(struct super_block *sb) { struct iso_inode_info *ei; ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void isofs_free_inode(struct inode *inode) { kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } static void init_once(void *foo) { struct iso_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!isofs_inode_cachep) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(isofs_inode_cachep); } static int isofs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); if (!(fc->sb_flags & SB_RDONLY)) return -EROFS; return 0; } static const struct super_operations isofs_sops = { .alloc_inode = isofs_alloc_inode, .free_inode = isofs_free_inode, .put_super = isofs_put_super, .statfs = isofs_statfs, .show_options = isofs_show_options, }; static const struct dentry_operations isofs_dentry_ops[] = { { .d_hash = isofs_hashi, .d_compare = isofs_dentry_cmpi, }, #ifdef CONFIG_JOLIET { .d_hash = isofs_hash_ms, .d_compare = isofs_dentry_cmp_ms, }, { .d_hash = isofs_hashi_ms, .d_compare = isofs_dentry_cmpi_ms, }, #endif }; struct isofs_options{ unsigned int rock:1; unsigned int joliet:1; unsigned int cruft:1; unsigned int hide:1; unsigned int showassoc:1; unsigned int nocompress:1; unsigned int overriderockperm:1; unsigned int uid_set:1; unsigned int gid_set:1; unsigned char map; unsigned char check; unsigned int blocksize; umode_t fmode; umode_t dmode; kgid_t gid; kuid_t uid; char *iocharset; /* LVE */ s32 session; s32 sbsector; }; /* * Compute the hash for the isofs name corresponding to the dentry. */ static int isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; char c; unsigned long hash; len = qstr->len; name = qstr->name; if (ms) { while (len && name[len-1] == '.') len--; } hash = init_name_hash(dentry); while (len--) { c = tolower(*name++); hash = partial_name_hash(c, hash); } qstr->hash = end_name_hash(hash); return 0; } /* * Compare of two isofs names. */ static int isofs_dentry_cmp_common( unsigned int len, const char *str, const struct qstr *name, int ms, int ci) { int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ alen = name->len; blen = len; if (ms) { while (alen && name->name[alen-1] == '.') alen--; while (blen && str[blen-1] == '.') blen--; } if (alen == blen) { if (ci) { if (strncasecmp(name->name, str, alen) == 0) return 0; } else { if (strncmp(name->name, str, alen) == 0) return 0; } } return 1; } static int isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } static int isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 1); } #ifdef CONFIG_JOLIET /* * Compute the hash for the isofs name corresponding to the dentry. */ static int isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; len = qstr->len; name = qstr->name; if (ms) { while (len && name[len-1] == '.') len--; } qstr->hash = full_name_hash(dentry, name, len); return 0; } static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 1); } static int isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 1); } #endif enum { Opt_block, Opt_check, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset, Opt_map, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, }; static const struct constant_table isofs_param_map[] = { {"acorn", 'a'}, {"a", 'a'}, {"normal", 'n'}, {"n", 'n'}, {"off", 'o'}, {"o", 'o'}, {} }; static const struct constant_table isofs_param_check[] = { {"relaxed", 'r'}, {"r", 'r'}, {"strict", 's'}, {"s", 's'}, {} }; static const struct fs_parameter_spec isofs_param_spec[] = { fsparam_flag ("norock", Opt_norock), fsparam_flag ("nojoliet", Opt_nojoliet), fsparam_flag ("unhide", Opt_unhide), fsparam_flag ("hide", Opt_hide), fsparam_flag ("showassoc", Opt_showassoc), fsparam_flag ("cruft", Opt_cruft), fsparam_flag ("utf8", Opt_utf8), fsparam_string ("iocharset", Opt_iocharset), fsparam_enum ("map", Opt_map, isofs_param_map), fsparam_u32 ("session", Opt_session), fsparam_u32 ("sbsector", Opt_sb), fsparam_enum ("check", Opt_check, isofs_param_check), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), /* Note: mode/dmode historically accepted %u not strictly %o */ fsparam_u32 ("mode", Opt_mode), fsparam_u32 ("dmode", Opt_dmode), fsparam_flag ("overriderockperm", Opt_overriderockperm), fsparam_u32 ("block", Opt_block), fsparam_string ("conv", Opt_ignore), fsparam_flag ("nocompress", Opt_nocompress), {} }; static int isofs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct isofs_options *popt = fc->fs_private; struct fs_parse_result result; int opt; unsigned int n; /* There are no remountable options */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; opt = fs_parse(fc, isofs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_norock: popt->rock = 0; break; case Opt_nojoliet: popt->joliet = 0; break; case Opt_hide: popt->hide = 1; break; case Opt_unhide: case Opt_showassoc: popt->showassoc = 1; break; case Opt_cruft: popt->cruft = 1; break; #ifdef CONFIG_JOLIET case Opt_utf8: kfree(popt->iocharset); popt->iocharset = kstrdup("utf8", GFP_KERNEL); if (!popt->iocharset) return -ENOMEM; break; case Opt_iocharset: kfree(popt->iocharset); popt->iocharset = kstrdup(param->string, GFP_KERNEL); if (!popt->iocharset) return -ENOMEM; break; #endif case Opt_map: popt->map = result.uint_32; break; case Opt_session: n = result.uint_32; /* * Track numbers are supposed to be in range 1-99, the * mount option starts indexing at 0. */ if (n >= 99) return -EINVAL; popt->session = n + 1; break; case Opt_sb: popt->sbsector = result.uint_32; break; case Opt_check: popt->check = result.uint_32; break; case Opt_ignore: break; case Opt_uid: popt->uid = result.uid; popt->uid_set = 1; break; case Opt_gid: popt->gid = result.gid; popt->gid_set = 1; break; case Opt_mode: popt->fmode = result.uint_32; break; case Opt_dmode: popt->dmode = result.uint_32; break; case Opt_overriderockperm: popt->overriderockperm = 1; break; case Opt_block: n = result.uint_32; if (n != 512 && n != 1024 && n != 2048) return -EINVAL; popt->blocksize = n; break; case Opt_nocompress: popt->nocompress = 1; break; default: return -EINVAL; } return 0; } /* * Display the mount options in /proc/mounts. */ static int isofs_show_options(struct seq_file *m, struct dentry *root) { struct isofs_sb_info *sbi = ISOFS_SB(root->d_sb); if (!sbi->s_rock) seq_puts(m, ",norock"); else if (!sbi->s_joliet_level) seq_puts(m, ",nojoliet"); if (sbi->s_cruft) seq_puts(m, ",cruft"); if (sbi->s_hide) seq_puts(m, ",hide"); if (sbi->s_nocompress) seq_puts(m, ",nocompress"); if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm"); if (sbi->s_showassoc) seq_puts(m, ",showassoc"); if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check); if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping); if (sbi->s_session != 255) seq_printf(m, ",session=%u", sbi->s_session - 1); if (sbi->s_sbsector != -1) seq_printf(m, ",sbsector=%u", sbi->s_sbsector); if (root->d_sb->s_blocksize != 1024) seq_printf(m, ",blocksize=%lu", root->d_sb->s_blocksize); if (sbi->s_uid_set) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid)); if (sbi->s_gid_set) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_dmode != ISOFS_INVALID_MODE) seq_printf(m, ",dmode=%o", sbi->s_dmode); if (sbi->s_fmode != ISOFS_INVALID_MODE) seq_printf(m, ",fmode=%o", sbi->s_fmode); #ifdef CONFIG_JOLIET if (sbi->s_nls_iocharset) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); else seq_puts(m, ",iocharset=utf8"); #endif return 0; } /* * look if the driver can tell the multi session redirection value * * don't change this if you don't know what you do, please! * Multisession is legal only with XA disks. * A non-XA disk with more than one volume descriptor may do it right, but * usually is written in a nowhere standardized "multi-partition" manner. * Multisession uses absolute addressing (solely the first frame of the whole * track is #0), multi-partition uses relative addressing (each first frame of * each track is #0), and a track is not a session. * * A broken CDwriter software or drive firmware does not set new standards, * at least not if conflicting with the existing ones. * * emoenke@gwdg.de */ #define WE_OBEY_THE_WRITTEN_STANDARDS 1 static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned int vol_desc_start = 0; if (session > 0) { struct cdrom_tocentry te; if (!cdi) return 0; te.cdte_track = session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) == 0) { printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", session, te.cdte_addr.lba, te.cdte_ctrl & CDROM_DATA_TRACK); if ((te.cdte_ctrl & CDROM_DATA_TRACK) == 4) return te.cdte_addr.lba; } printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); } if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0) { #if WE_OBEY_THE_WRITTEN_STANDARDS /* necessary for a valid ms_info.addr */ if (ms_info.xa_flag) #endif vol_desc_start = ms_info.addr.lba; } } return vol_desc_start; } /* * Check if root directory is empty (has less than 3 files). * * Used to detect broken CDs where ISO root directory is empty but Joliet root * directory is OK. If such CD has Rock Ridge extensions, they will be disabled * (and Joliet used instead) or else no files would be visible. */ static bool rootdir_empty(struct super_block *sb, unsigned long block) { int offset = 0, files = 0, de_len; struct iso_directory_record *de; struct buffer_head *bh; bh = sb_bread(sb, block); if (!bh) return true; while (files < 3) { de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len == 0) break; files++; offset += de_len; } brelse(bh); return files < 3; } /* * Initialize the superblock and read the root inode. */ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh = NULL, *pri_bh = NULL; struct hs_primary_descriptor *h_pri = NULL; struct iso_primary_descriptor *pri = NULL; struct iso_supplementary_descriptor *sec = NULL; struct iso_directory_record *rootp; struct inode *inode; struct isofs_options *opt = fc->fs_private; struct isofs_sb_info *sbi; unsigned long first_data_zone; int joliet_level = 0; int iso_blknum, block; int orig_zonesize; int table, error = -EINVAL; unsigned int vol_desc_start; int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; /* * First of all, get the hardware blocksize for this device. * If we don't know what it is, or the hardware blocksize is * larger than the blocksize the user specified, then use * that value. */ /* * What if bugger tells us to go beyond page size? */ if (bdev_logical_block_size(s->s_bdev) > 2048) { printk(KERN_WARNING "ISOFS: unsupported/invalid hardware sector size %d\n", bdev_logical_block_size(s->s_bdev)); goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; sbi->s_sbsector = opt->sbsector; vol_desc_start = (opt->sbsector != -1) ? opt->sbsector : isofs_get_last_session(s, opt->session); for (iso_blknum = vol_desc_start+16; iso_blknum < vol_desc_start+100; iso_blknum++) { struct hs_volume_descriptor *hdp; struct iso_volume_descriptor *vdp; block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); if (!(bh = sb_bread(s, block))) goto out_no_read; vdp = (struct iso_volume_descriptor *)bh->b_data; hdp = (struct hs_volume_descriptor *)bh->b_data; /* * Due to the overlapping physical location of the descriptors, * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure * proper identification in this case, we first check for ISO. */ if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { if (isonum_711(vdp->type) == ISO_VD_END) break; if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { if (!pri) { pri = (struct iso_primary_descriptor *)vdp; /* Save the buffer in case we need it ... */ pri_bh = bh; bh = NULL; } } #ifdef CONFIG_JOLIET else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { sec = (struct iso_supplementary_descriptor *)vdp; if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { if (opt->joliet) { if (sec->escape[2] == 0x40) joliet_level = 1; else if (sec->escape[2] == 0x43) joliet_level = 2; else if (sec->escape[2] == 0x45) joliet_level = 3; printk(KERN_DEBUG "ISO 9660 Extensions: " "Microsoft Joliet Level %d\n", joliet_level); } goto root_found; } else { /* Unknown supplementary volume descriptor */ sec = NULL; } } #endif } else { if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { if (isonum_711(hdp->type) != ISO_VD_PRIMARY) goto out_freebh; sbi->s_high_sierra = 1; opt->rock = 0; h_pri = (struct hs_primary_descriptor *)vdp; goto root_found; } } /* Just skip any volume descriptors we don't recognize */ brelse(bh); bh = NULL; } /* * If we fall through, either no volume descriptor was found, * or else we passed a primary descriptor looking for others. */ if (!pri) goto out_unknown_format; brelse(bh); bh = pri_bh; pri_bh = NULL; root_found: /* We don't support read-write mounts */ if (!sb_rdonly(s)) { error = -EACCES; goto out_freebh; } if (joliet_level && (!pri || !opt->rock)) { /* This is the case of Joliet with the norock mount flag. * A disc with both Joliet and Rock Ridge is handled later */ pri = (struct iso_primary_descriptor *) sec; } if(sbi->s_high_sierra){ rootp = (struct iso_directory_record *) h_pri->root_directory_record; sbi->s_nzones = isonum_733(h_pri->volume_space_size); sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size); sbi->s_max_size = isonum_733(h_pri->volume_space_size); } else { if (!pri) goto out_freebh; rootp = (struct iso_directory_record *) pri->root_directory_record; sbi->s_nzones = isonum_733(pri->volume_space_size); sbi->s_log_zone_size = isonum_723(pri->logical_block_size); sbi->s_max_size = isonum_733(pri->volume_space_size); } sbi->s_ninodes = 0; /* No way to figure this out easily */ orig_zonesize = sbi->s_log_zone_size; /* * If the zone size is smaller than the hardware sector size, * this is a fatal error. This would occur if the disc drive * had sectors that were 2048 bytes, but the filesystem had * blocks that were 512 bytes (which should only very rarely * happen.) */ if (orig_zonesize < opt->blocksize) goto out_bad_size; /* RDE: convert log zone size to bit shift */ switch (sbi->s_log_zone_size) { case 512: sbi->s_log_zone_size = 9; break; case 1024: sbi->s_log_zone_size = 10; break; case 2048: sbi->s_log_zone_size = 11; break; default: goto out_bad_zone_size; } s->s_magic = ISOFS_SUPER_MAGIC; /* * With multi-extent files, file size is only limited by the maximum * size of a file system, which is 8 TB. */ s->s_maxbytes = 0x80000000000LL; /* ECMA-119 timestamp from 1900/1/1 with tz offset */ s->s_time_min = mktime64(1900, 1, 1, 0, 0, 0) - MAX_TZ_OFFSET; s->s_time_max = mktime64(U8_MAX+1900, 12, 31, 23, 59, 59) + MAX_TZ_OFFSET; /* Set this for reference. Its not currently used except on write which we don't have .. */ first_data_zone = isonum_733(rootp->extent) + isonum_711(rootp->ext_attr_length); sbi->s_firstdatazone = first_data_zone; #ifndef BEQUIET printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", sbi->s_max_size, 1UL << sbi->s_log_zone_size); printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); if(sbi->s_high_sierra) printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); #endif /* * If the Joliet level is set, we _may_ decide to use the * secondary descriptor, but can't be sure until after we * read the root inode. But before reading the root inode * we may need to change the device blocksize, and would * rather release the old buffer first. So, we cache the * first_data_zone value from the secondary descriptor. */ if (joliet_level) { pri = (struct iso_primary_descriptor *) sec; rootp = (struct iso_directory_record *) pri->root_directory_record; first_data_zone = isonum_733(rootp->extent) + isonum_711(rootp->ext_attr_length); } /* * We're all done using the volume descriptor, and may need * to change the device blocksize, so release the buffer now. */ brelse(pri_bh); brelse(bh); /* * Force the blocksize to 512 for 512 byte sectors. The file * read primitives really get it wrong in a bad way if we don't * do this. * * Note - we should never be setting the blocksize to something * less than the hardware sector size for the device. If we * do, we would end up having to read larger buffers and split * out portions to satisfy requests. * * Note2- the idea here is that we want to deal with the optimal * zonesize in the filesystem. If we have it set to something less, * then we have horrible problems with trying to piece together * bits of adjacent blocks in order to properly read directory * entries. By forcing the blocksize in this way, we ensure * that we will never be required to do this. */ sb_set_blocksize(s, orig_zonesize); sbi->s_nls_iocharset = NULL; #ifdef CONFIG_JOLIET if (joliet_level) { char *p = opt->iocharset ? opt->iocharset : CONFIG_NLS_DEFAULT; if (strcmp(p, "utf8") != 0) { sbi->s_nls_iocharset = opt->iocharset ? load_nls(opt->iocharset) : load_nls_default(); if (!sbi->s_nls_iocharset) goto out_freesbi; } } #endif s->s_op = &isofs_sops; s->s_export_op = &isofs_export_ops; sbi->s_mapping = opt->map; sbi->s_rock = (opt->rock ? 2 : 0); sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ sbi->s_cruft = opt->cruft; sbi->s_hide = opt->hide; sbi->s_showassoc = opt->showassoc; sbi->s_uid = opt->uid; sbi->s_gid = opt->gid; sbi->s_uid_set = opt->uid_set; sbi->s_gid_set = opt->gid_set; sbi->s_nocompress = opt->nocompress; sbi->s_overriderockperm = opt->overriderockperm; /* * It would be incredibly stupid to allow people to mark every file * on the disk as suid, so we merely allow them to set the default * permissions. */ if (opt->fmode != ISOFS_INVALID_MODE) sbi->s_fmode = opt->fmode & 0777; else sbi->s_fmode = ISOFS_INVALID_MODE; if (opt->dmode != ISOFS_INVALID_MODE) sbi->s_dmode = opt->dmode & 0777; else sbi->s_dmode = ISOFS_INVALID_MODE; /* * Read the root inode, which _may_ result in changing * the s_rock flag. Once we have the final s_rock value, * we then decide whether to use the Joliet descriptor. */ inode = isofs_iget(s, sbi->s_firstdatazone, 0); /* * Fix for broken CDs with a corrupt root inode but a correct Joliet * root directory. */ if (IS_ERR(inode)) { if (joliet_level && sbi->s_firstdatazone != first_data_zone) { printk(KERN_NOTICE "ISOFS: root inode is unusable. " "Disabling Rock Ridge and switching to Joliet."); sbi->s_rock = 0; inode = NULL; } else { goto out_no_root; } } /* * Fix for broken CDs with Rock Ridge and empty ISO root directory but * correct Joliet root directory. */ if (sbi->s_rock == 1 && joliet_level && rootdir_empty(s, sbi->s_firstdatazone)) { printk(KERN_NOTICE "ISOFS: primary root directory is empty. " "Disabling Rock Ridge and switching to Joliet."); sbi->s_rock = 0; } /* * If this disk has both Rock Ridge and Joliet on it, then we * want to use Rock Ridge by default. This can be overridden * by using the norock mount option. There is still one other * possibility that is not taken into account: a Rock Ridge * CD with Unicode names. Until someone sees such a beast, it * will not be supported. */ if (sbi->s_rock == 1) { joliet_level = 0; } else if (joliet_level) { sbi->s_rock = 0; if (sbi->s_firstdatazone != first_data_zone) { sbi->s_firstdatazone = first_data_zone; printk(KERN_DEBUG "ISOFS: changing to secondary root\n"); iput(inode); inode = isofs_iget(s, sbi->s_firstdatazone, 0); if (IS_ERR(inode)) goto out_no_root; } } if (opt->check == 'u') { /* Only Joliet is case insensitive by default */ if (joliet_level) opt->check = 'r'; else opt->check = 's'; } sbi->s_joliet_level = joliet_level; /* Make sure the root inode is a directory */ if (!S_ISDIR(inode->i_mode)) { printk(KERN_WARNING "isofs_fill_super: root inode is not a directory. " "Corrupted media?\n"); goto out_iput; } table = 0; if (joliet_level) table += 2; if (opt->check == 'r') table++; sbi->s_check = opt->check; if (table) s->s_d_op = &isofs_dentry_ops[table - 1]; /* get the root dentry */ s->s_root = d_make_root(inode); if (!(s->s_root)) { error = -ENOMEM; goto out_no_inode; } kfree(opt->iocharset); return 0; /* * Display error messages and free resources. */ out_iput: iput(inode); goto out_no_inode; out_no_root: error = PTR_ERR(inode); if (error != -ENOMEM) printk(KERN_WARNING "%s: get root inode failed\n", __func__); out_no_inode: #ifdef CONFIG_JOLIET unload_nls(sbi->s_nls_iocharset); #endif goto out_freesbi; out_no_read: printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); goto out_freebh; out_bad_zone_size: printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); goto out_freebh; out_bad_size: printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", orig_zonesize, opt->blocksize); goto out_freebh; out_unknown_format: if (!silent) printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); out_freebh: brelse(bh); brelse(pri_bh); out_freesbi: kfree(opt->iocharset); kfree(sbi); s->s_fs_info = NULL; return error; } static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (ISOFS_SB(sb)->s_nzones << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; buf->f_ffree = 0; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = NAME_MAX; return 0; } /* * Get a set of blocks; filling in buffer_heads if already allocated * or getblk() if they are not. Returns the number of blocks inserted * (-ve == error.) */ int isofs_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head **bh, unsigned long nblocks) { unsigned long b_off = iblock; unsigned offset, sect_size; unsigned int firstext; unsigned long nextblk, nextoff; int section, rv, error; struct iso_inode_info *ei = ISOFS_I(inode); error = -EIO; rv = 0; if (iblock != b_off) { printk(KERN_DEBUG "%s: block number too large\n", __func__); goto abort; } offset = 0; firstext = ei->i_first_extent; sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); nextblk = ei->i_next_section_block; nextoff = ei->i_next_section_offset; section = 0; while (nblocks) { /* If we are *way* beyond the end of the file, print a message. * Access beyond the end of the file up to the next page boundary * is normal, however because of the way the page cache works. * In this case, we just return 0 so that we can properly fill * the page with useless information without generating any * I/O errors. */ if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", __func__, b_off, (unsigned long long)inode->i_size); goto abort; } /* On the last section, nextblk == 0, section size is likely to * exceed sect_size by a partial block, and access beyond the * end of the file will reach beyond the section size, too. */ while (nextblk && (b_off >= (offset + sect_size))) { struct inode *ninode; offset += sect_size; ninode = isofs_iget(inode->i_sb, nextblk, nextoff); if (IS_ERR(ninode)) { error = PTR_ERR(ninode); goto abort; } firstext = ISOFS_I(ninode)->i_first_extent; sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); nextblk = ISOFS_I(ninode)->i_next_section_block; nextoff = ISOFS_I(ninode)->i_next_section_offset; iput(ninode); if (++section > 100) { printk(KERN_DEBUG "%s: More than 100 file sections ?!?" " aborting...\n", __func__); printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " "nextblk=%lu nextoff=%lu\n", __func__, b_off, firstext, (unsigned) sect_size, nextblk, nextoff); goto abort; } } if (*bh) { map_bh(*bh, inode->i_sb, firstext + b_off - offset); } else { *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); if (!*bh) goto abort; } bh++; /* Next buffer head */ b_off++; /* Next buffer offset */ nblocks--; rv++; } error = 0; abort: return rv != 0 ? rv : error; } /* * Used by the standard interfaces. */ static int isofs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; if (create) { printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); return -EROFS; } ret = isofs_get_blocks(inode, iblock, &bh_result, 1); return ret < 0 ? ret : 0; } static int isofs_bmap(struct inode *inode, sector_t block) { struct buffer_head dummy; int error; dummy.b_state = 0; dummy.b_blocknr = -1000; error = isofs_get_block(inode, block, &dummy, 0); if (!error) return dummy.b_blocknr; return 0; } struct buffer_head *isofs_bread(struct inode *inode, sector_t block) { sector_t blknr = isofs_bmap(inode, block); if (!blknr) return NULL; return sb_bread(inode->i_sb, blknr); } static int isofs_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, isofs_get_block); } static void isofs_readahead(struct readahead_control *rac) { mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,isofs_get_block); } static const struct address_space_operations isofs_aops = { .read_folio = isofs_read_folio, .readahead = isofs_readahead, .bmap = _isofs_bmap }; static int isofs_read_level3_size(struct inode *inode) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; struct buffer_head *bh = NULL; unsigned long block, offset, block_saved, offset_saved; int i = 0; int more_entries = 0; struct iso_directory_record *tmpde = NULL; struct iso_inode_info *ei = ISOFS_I(inode); inode->i_size = 0; /* The first 16 blocks are reserved as the System Area. Thus, * no inodes can appear in block 0. We use this to flag that * this is the last section. */ ei->i_next_section_block = 0; ei->i_next_section_offset = 0; block = ei->i_iget5_block; offset = ei->i_iget5_offset; do { struct iso_directory_record *de; unsigned int de_len; if (!bh) { bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; } de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len == 0) { brelse(bh); bh = NULL; ++block; offset = 0; continue; } block_saved = block; offset_saved = offset; offset += de_len; /* Make sure we have a full directory entry */ if (offset >= bufsize) { int slop = bufsize - offset + de_len; if (!tmpde) { tmpde = kmalloc(256, GFP_KERNEL); if (!tmpde) goto out_nomem; } memcpy(tmpde, de, slop); offset &= bufsize - 1; block++; brelse(bh); bh = NULL; if (offset) { bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; memcpy((void *)tmpde+slop, bh->b_data, offset); } de = tmpde; } inode->i_size += isonum_733(de->size); if (i == 1) { ei->i_next_section_block = block_saved; ei->i_next_section_offset = offset_saved; } more_entries = de->flags[-high_sierra] & 0x80; i++; if (i > 100) goto out_toomany; } while (more_entries); out: kfree(tmpde); brelse(bh); return 0; out_nomem: brelse(bh); return -ENOMEM; out_noread: printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block); kfree(tmpde); return -EIO; out_toomany: printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" "isofs_read_level3_size: inode=%lu\n", __func__, inode->i_ino); goto out; } static int isofs_read_inode(struct inode *inode, int relocated) { struct super_block *sb = inode->i_sb; struct isofs_sb_info *sbi = ISOFS_SB(sb); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned long block; int high_sierra = sbi->s_high_sierra; struct buffer_head *bh; struct iso_directory_record *de; struct iso_directory_record *tmpde = NULL; unsigned int de_len; unsigned long offset; struct iso_inode_info *ei = ISOFS_I(inode); int ret = -EIO; block = ei->i_iget5_block; bh = sb_bread(inode->i_sb, block); if (!bh) goto out_badread; offset = ei->i_iget5_offset; de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len < sizeof(struct iso_directory_record)) goto fail; if (offset + de_len > bufsize) { int frag1 = bufsize - offset; tmpde = kmalloc(de_len, GFP_KERNEL); if (!tmpde) { ret = -ENOMEM; goto fail; } memcpy(tmpde, bh->b_data + offset, frag1); brelse(bh); bh = sb_bread(inode->i_sb, ++block); if (!bh) goto out_badread; memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1); de = tmpde; } inode->i_ino = isofs_get_ino(ei->i_iget5_block, ei->i_iget5_offset, ISOFS_BUFFER_BITS(inode)); /* Assume it is a normal-format file unless told otherwise */ ei->i_file_format = isofs_file_normal; if (de->flags[-high_sierra] & 2) { if (sbi->s_dmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFDIR | sbi->s_dmode; else inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; set_nlink(inode, 1); /* * Set to 1. We know there are 2, but * the find utility tries to optimize * if it is 2, and it screws up. It is * easier to give 1 which tells find to * do it the hard way. */ } else { if (sbi->s_fmode != ISOFS_INVALID_MODE) { inode->i_mode = S_IFREG | sbi->s_fmode; } else { /* * Set default permissions: r-x for all. The disc * could be shared with DOS machines so virtually * anything could be a valid executable. */ inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; } set_nlink(inode, 1); } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; inode->i_blocks = 0; ei->i_format_parm[0] = 0; ei->i_format_parm[1] = 0; ei->i_format_parm[2] = 0; ei->i_section_size = isonum_733(de->size); if (de->flags[-high_sierra] & 0x80) { ret = isofs_read_level3_size(inode); if (ret < 0) goto fail; ret = -EIO; } else { ei->i_next_section_block = 0; ei->i_next_section_offset = 0; inode->i_size = isonum_733(de->size); } /* * Some dipshit decided to store some other bit of information * in the high byte of the file length. Truncate size in case * this CDROM was mounted with the cruft option. */ if (sbi->s_cruft) inode->i_size &= 0x00ffffff; if (de->interleave[0]) { printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); inode->i_size = 0; } /* I have no idea what file_unit_size is used for, so we will flag it for now */ if (de->file_unit_size[0] != 0) { printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", inode->i_ino); } /* I have no idea what other flag bits are used for, so we will flag it for now */ #ifdef DEBUG if((de->flags[-high_sierra] & ~2)!= 0){ printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " "(%ld %x).\n", inode->i_ino, de->flags[-high_sierra]); } #endif inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0))); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); /* Set the number of blocks for stat() - should be done before RR */ inode->i_blocks = (inode->i_size + 511) >> 9; /* * Now test for possible Rock Ridge extensions which will override * some of these numbers in the inode structure. */ if (!high_sierra) { parse_rock_ridge_inode(de, inode, relocated); /* if we want uid/gid set, override the rock ridge setting */ if (sbi->s_uid_set) inode->i_uid = sbi->s_uid; if (sbi->s_gid_set) inode->i_gid = sbi->s_gid; } /* Now set final access rights if overriding rock ridge setting */ if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm && sbi->s_dmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFDIR | sbi->s_dmode; if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm && sbi->s_fmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFREG | sbi->s_fmode; /* Install the inode operations vector */ if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; switch (ei->i_file_format) { #ifdef CONFIG_ZISOFS case isofs_file_compressed: inode->i_data.a_ops = &zisofs_aops; break; #endif default: inode->i_data.a_ops = &isofs_aops; break; } } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &isofs_dir_inode_operations; inode->i_fop = &isofs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_data.a_ops = &isofs_symlink_aops; } else /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ init_special_inode(inode, inode->i_mode, inode->i_rdev); ret = 0; out: kfree(tmpde); brelse(bh); return ret; out_badread: printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); fail: goto out; } struct isofs_iget5_callback_data { unsigned long block; unsigned long offset; }; static int isofs_iget5_test(struct inode *ino, void *data) { struct iso_inode_info *i = ISOFS_I(ino); struct isofs_iget5_callback_data *d = (struct isofs_iget5_callback_data*)data; return (i->i_iget5_block == d->block) && (i->i_iget5_offset == d->offset); } static int isofs_iget5_set(struct inode *ino, void *data) { struct iso_inode_info *i = ISOFS_I(ino); struct isofs_iget5_callback_data *d = (struct isofs_iget5_callback_data*)data; i->i_iget5_block = d->block; i->i_iget5_offset = d->offset; return 0; } /* Store, in the inode's containing structure, the block and block * offset that point to the underlying meta-data for the inode. The * code below is otherwise similar to the iget() code in * include/linux/fs.h */ struct inode *__isofs_iget(struct super_block *sb, unsigned long block, unsigned long offset, int relocated) { unsigned long hashval; struct inode *inode; struct isofs_iget5_callback_data data; long ret; if (offset >= 1ul << sb->s_blocksize_bits) return ERR_PTR(-EINVAL); data.block = block; data.offset = offset; hashval = (block << sb->s_blocksize_bits) | offset; inode = iget5_locked(sb, hashval, &isofs_iget5_test, &isofs_iget5_set, &data); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); inode = ERR_PTR(ret); } else { unlock_new_inode(inode); } } return inode; } static int isofs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, isofs_fill_super); } static void isofs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations isofs_context_ops = { .parse_param = isofs_parse_param, .get_tree = isofs_get_tree, .reconfigure = isofs_reconfigure, .free = isofs_free_fc, }; static int isofs_init_fs_context(struct fs_context *fc) { struct isofs_options *opt; opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) return -ENOMEM; opt->map = 'n'; opt->rock = 1; opt->joliet = 1; opt->cruft = 0; opt->hide = 0; opt->showassoc = 0; opt->check = 'u'; /* unset */ opt->nocompress = 0; opt->blocksize = 1024; opt->fmode = opt->dmode = ISOFS_INVALID_MODE; opt->uid_set = 0; opt->gid_set = 0; opt->gid = GLOBAL_ROOT_GID; opt->uid = GLOBAL_ROOT_UID; opt->iocharset = NULL; opt->overriderockperm = 0; opt->session = -1; opt->sbsector = -1; fc->fs_private = opt; fc->ops = &isofs_context_ops; return 0; } static struct file_system_type iso9660_fs_type = { .owner = THIS_MODULE, .name = "iso9660", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = isofs_init_fs_context, .parameters = isofs_param_spec, }; MODULE_ALIAS_FS("iso9660"); MODULE_ALIAS("iso9660"); static int __init init_iso9660_fs(void) { int err = init_inodecache(); if (err) goto out; #ifdef CONFIG_ZISOFS err = zisofs_init(); if (err) goto out1; #endif err = register_filesystem(&iso9660_fs_type); if (err) goto out2; return 0; out2: #ifdef CONFIG_ZISOFS zisofs_cleanup(); out1: #endif destroy_inodecache(); out: return err; } static void __exit exit_iso9660_fs(void) { unregister_filesystem(&iso9660_fs_type); #ifdef CONFIG_ZISOFS zisofs_cleanup(); #endif destroy_inodecache(); } module_init(init_iso9660_fs) module_exit(exit_iso9660_fs) MODULE_DESCRIPTION("ISO 9660 CDROM file system support"); MODULE_LICENSE("GPL"); |
| 3 1 2 3 13 4 12 8 1 12 12 7 8 11 10 2 5 3 11 10 3 3 9 2 1 10 3 3 2 7 12 12 12 1 11 4 2 2 7 1 4 2 6 15 2 13 15 6 6 1 6 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 | /* Block- or MTD-based romfs * * Copyright © 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Derived from: ROMFS file system, Linux implementation * * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu> * * Using parts of the minix filesystem * Copyright © 1991, 1992 Linus Torvalds * * and parts of the affs filesystem additionally * Copyright © 1993 Ray Burr * Copyright © 1996 Hans-Joachim Widmaier * * Changes * Changed for 2.1.19 modules * Jan 1997 Initial release * Jun 1997 2.1.43+ changes * Proper page locking in read_folio * Changed to work with 2.1.45+ fs * Jul 1997 Fixed follow_link * 2.1.47 * lookup shouldn't return -ENOENT * from Horst von Brand: * fail on wrong checksum * double unlock_super was possible * correct namelen for statfs * spotted by Bill Hawes: * readlink shouldn't iput() * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() * exposed a problem in readdir * 2.1.107 code-freeze spellchecker run * Aug 1998 2.1.118+ VFS changes * Sep 1998 2.1.122 another VFS change (follow_link) * Apr 1999 2.2.7 no more EBADF checking in * lookup/readdir, use ERR_PTR * Jun 1999 2.3.6 d_alloc_root use changed * 2.3.9 clean up usage of ENOENT/negative * dentries in lookup * clean up page flags setting * (error, uptodate, locking) in * in read_folio * use init_special_inode for * fifos/sockets (and streamline) in * read_inode, fix _ops table order * Aug 1999 2.3.16 __initfunc() => __init change * Oct 1999 2.3.24 page->owner hack obsoleted * Nov 1999 2.3.27 2.3.25+ page->offset => index change * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/statfs.h> #include <linux/mtd/super.h> #include <linux/ctype.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/uaccess.h> #include <linux/major.h> #include "internal.h" static struct kmem_cache *romfs_inode_cachep; static const umode_t romfs_modemap[8] = { 0, /* hard link */ S_IFDIR | 0644, /* directory */ S_IFREG | 0644, /* regular file */ S_IFLNK | 0777, /* symlink */ S_IFBLK | 0600, /* blockdev */ S_IFCHR | 0600, /* chardev */ S_IFSOCK | 0644, /* socket */ S_IFIFO | 0644 /* FIFO */ }; static const unsigned char romfs_dtype_table[] = { DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO }; static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); /* * read a page worth of data from the image */ static int romfs_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; loff_t offset, size; unsigned long fillsize, pos; void *buf; int ret; buf = kmap_local_folio(folio, 0); offset = folio_pos(folio); size = i_size_read(inode); fillsize = 0; ret = 0; if (offset < size) { size -= offset; fillsize = size > PAGE_SIZE ? PAGE_SIZE : size; pos = ROMFS_I(inode)->i_dataoffset + offset; ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); if (ret < 0) { fillsize = 0; ret = -EIO; } } buf = folio_zero_tail(folio, fillsize, buf + fillsize); kunmap_local(buf); folio_end_read(folio, ret == 0); return ret; } static const struct address_space_operations romfs_aops = { .read_folio = romfs_read_folio }; /* * read the entries from a directory */ static int romfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *i = file_inode(file); struct romfs_inode ri; unsigned long offset, maxoff; int j, ino, nextfh; char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ int ret; maxoff = romfs_maxsize(i->i_sb); offset = ctx->pos; if (!offset) { offset = i->i_ino & ROMFH_MASK; ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); if (ret < 0) goto out; offset = be32_to_cpu(ri.spec) & ROMFH_MASK; } /* Not really failsafe, but we are read-only... */ for (;;) { if (!offset || offset >= maxoff) { offset = maxoff; ctx->pos = offset; goto out; } ctx->pos = offset; /* Fetch inode info */ ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); if (ret < 0) goto out; j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, sizeof(fsname) - 1); if (j < 0) goto out; ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); if (ret < 0) goto out; fsname[j] = '\0'; ino = offset; nextfh = be32_to_cpu(ri.next); if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) ino = be32_to_cpu(ri.spec); if (!dir_emit(ctx, fsname, j, ino, romfs_dtype_table[nextfh & ROMFH_TYPE])) goto out; offset = nextfh & ROMFH_MASK; } out: return 0; } /* * look up an entry in a directory */ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long offset, maxoff; struct inode *inode = NULL; struct romfs_inode ri; const char *name; /* got from dentry */ int len, ret; offset = dir->i_ino & ROMFH_MASK; ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); if (ret < 0) goto error; /* search all the file entries in the list starting from the one * pointed to by the directory's special data */ maxoff = romfs_maxsize(dir->i_sb); offset = be32_to_cpu(ri.spec) & ROMFH_MASK; name = dentry->d_name.name; len = dentry->d_name.len; for (;;) { if (!offset || offset >= maxoff) break; ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); if (ret < 0) goto error; /* try to match the first 16 bytes of name */ ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, len); if (ret < 0) goto error; if (ret == 1) { /* Hard link handling */ if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) offset = be32_to_cpu(ri.spec) & ROMFH_MASK; inode = romfs_iget(dir->i_sb, offset); break; } /* next entry */ offset = be32_to_cpu(ri.next) & ROMFH_MASK; } return d_splice_alias(inode, dentry); error: return ERR_PTR(ret); } static const struct file_operations romfs_dir_operations = { .read = generic_read_dir, .iterate_shared = romfs_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations romfs_dir_inode_operations = { .lookup = romfs_lookup, }; /* * get a romfs inode based on its position in the image (which doubles as the * inode number) */ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) { struct romfs_inode_info *inode; struct romfs_inode ri; struct inode *i; unsigned long nlen; unsigned nextfh; int ret; umode_t mode; /* we might have to traverse a chain of "hard link" file entries to get * to the actual file */ for (;;) { ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); if (ret < 0) goto error; /* XXX: do romfs_checksum here too (with name) */ nextfh = be32_to_cpu(ri.next); if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) break; pos = be32_to_cpu(ri.spec) & ROMFH_MASK; } /* determine the length of the filename */ nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); if (IS_ERR_VALUE(nlen)) goto eio; /* get an inode for this image position */ i = iget_locked(sb, pos); if (!i) return ERR_PTR(-ENOMEM); if (!(i->i_state & I_NEW)) return i; /* precalculate the data offset */ inode = ROMFS_I(i); inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; inode->i_dataoffset = pos + inode->i_metasize; set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); inode_set_mtime_to_ts(i, inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0))); /* set up mode and ops */ mode = romfs_modemap[nextfh & ROMFH_TYPE]; switch (nextfh & ROMFH_TYPE) { case ROMFH_DIR: i->i_size = ROMFS_I(i)->i_metasize; i->i_op = &romfs_dir_inode_operations; i->i_fop = &romfs_dir_operations; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; case ROMFH_REG: i->i_fop = &romfs_ro_fops; i->i_data.a_ops = &romfs_aops; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; case ROMFH_SYM: i->i_op = &page_symlink_inode_operations; inode_nohighmem(i); i->i_data.a_ops = &romfs_aops; mode |= S_IRWXUGO; break; default: /* depending on MBZ for sock/fifos */ nextfh = be32_to_cpu(ri.spec); init_special_inode(i, mode, MKDEV(nextfh >> 16, nextfh & 0xffff)); break; } i->i_mode = mode; i->i_blocks = (i->i_size + 511) >> 9; unlock_new_inode(i); return i; eio: ret = -EIO; error: pr_err("read error for inode 0x%lx\n", pos); return ERR_PTR(ret); } /* * allocate a new inode */ static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *inode; inode = alloc_inode_sb(sb, romfs_inode_cachep, GFP_KERNEL); return inode ? &inode->vfs_inode : NULL; } /* * return a spent inode to the slab cache */ static void romfs_free_inode(struct inode *inode) { kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } /* * get filesystem statistics */ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = 0; /* When calling huge_encode_dev(), * use sb->s_bdev->bd_dev when, * - CONFIG_ROMFS_ON_BLOCK defined * use sb->s_dev when, * - CONFIG_ROMFS_ON_BLOCK undefined and * - CONFIG_ROMFS_ON_MTD defined * leave id as 0 when, * - CONFIG_ROMFS_ON_BLOCK undefined and * - CONFIG_ROMFS_ON_MTD undefined */ if (sb->s_bdev) id = huge_encode_dev(sb->s_bdev->bd_dev); else if (sb->s_dev) id = huge_encode_dev(sb->s_dev); buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; buf->f_bsize = ROMBSIZE; buf->f_bfree = buf->f_bavail = buf->f_ffree; buf->f_blocks = (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; buf->f_fsid = u64_to_fsid(id); return 0; } /* * remounting must involve read-only */ static int romfs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; return 0; } static const struct super_operations romfs_super_ops = { .alloc_inode = romfs_alloc_inode, .free_inode = romfs_free_inode, .statfs = romfs_statfs, }; /* * checksum check on part of a romfs filesystem */ static __u32 romfs_checksum(const void *data, int size) { const __be32 *ptr = data; __u32 sum; sum = 0; size >>= 2; while (size > 0) { sum += be32_to_cpu(*ptr++); size--; } return sum; } /* * fill in the superblock */ static int romfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct romfs_super_block *rsb; struct inode *root; unsigned long pos, img_size; const char *storage; size_t len; int ret; #ifdef CONFIG_BLOCK if (!sb->s_mtd) { sb_set_blocksize(sb, ROMBSIZE); } else { sb->s_blocksize = ROMBSIZE; sb->s_blocksize_bits = blksize_bits(ROMBSIZE); } #endif sb->s_maxbytes = 0xFFFFFFFF; sb->s_magic = ROMFS_MAGIC; sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_time_min = 0; sb->s_time_max = 0; sb->s_op = &romfs_super_ops; #ifdef CONFIG_ROMFS_ON_MTD /* Use same dev ID from the underlying mtdblock device */ if (sb->s_mtd) sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); #endif /* read the image superblock and check it */ rsb = kmalloc(512, GFP_KERNEL); if (!rsb) return -ENOMEM; sb->s_fs_info = (void *) 512; ret = romfs_dev_read(sb, 0, rsb, 512); if (ret < 0) goto error_rsb; img_size = be32_to_cpu(rsb->size); if (sb->s_mtd && img_size > sb->s_mtd->size) goto error_rsb_inval; sb->s_fs_info = (void *) img_size; if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || img_size < ROMFH_SIZE) { if (!(fc->sb_flags & SB_SILENT)) errorf(fc, "VFS: Can't find a romfs filesystem on dev %s.\n", sb->s_id); goto error_rsb_inval; } if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { pr_err("bad initial checksum on dev %s.\n", sb->s_id); goto error_rsb_inval; } storage = sb->s_mtd ? "MTD" : "the block layer"; len = strnlen(rsb->name, ROMFS_MAXFN); if (!(fc->sb_flags & SB_SILENT)) pr_notice("Mounting image '%*.*s' through %s\n", (unsigned) len, (unsigned) len, rsb->name, storage); kfree(rsb); rsb = NULL; /* find the root directory */ pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; root = romfs_iget(sb, pos); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; return 0; error_rsb_inval: ret = -EINVAL; error_rsb: kfree(rsb); return ret; } /* * get a superblock for mounting */ static int romfs_get_tree(struct fs_context *fc) { int ret = -EINVAL; #ifdef CONFIG_ROMFS_ON_MTD ret = get_tree_mtd(fc, romfs_fill_super); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (ret == -EINVAL) ret = get_tree_bdev(fc, romfs_fill_super); #endif return ret; } static const struct fs_context_operations romfs_context_ops = { .get_tree = romfs_get_tree, .reconfigure = romfs_reconfigure, }; /* * Set up the filesystem mount context. */ static int romfs_init_fs_context(struct fs_context *fc) { fc->ops = &romfs_context_ops; return 0; } /* * destroy a romfs superblock in the appropriate manner */ static void romfs_kill_sb(struct super_block *sb) { generic_shutdown_super(sb); #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) { put_mtd_device(sb->s_mtd); sb->s_mtd = NULL; } #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); bdev_fput(sb->s_bdev_file); } #endif } static struct file_system_type romfs_fs_type = { .owner = THIS_MODULE, .name = "romfs", .init_fs_context = romfs_init_fs_context, .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("romfs"); /* * inode storage initialiser */ static void romfs_i_init_once(void *_inode) { struct romfs_inode_info *inode = _inode; inode_init_once(&inode->vfs_inode); } /* * romfs module initialisation */ static int __init init_romfs_fs(void) { int ret; pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n"); romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); return -ENOMEM; } ret = register_filesystem(&romfs_fs_type); if (ret) { pr_err("Failed to register filesystem\n"); goto error_register; } return 0; error_register: kmem_cache_destroy(romfs_inode_cachep); return ret; } /* * romfs module removal */ static void __exit exit_romfs_fs(void) { unregister_filesystem(&romfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(romfs_inode_cachep); } module_init(init_romfs_fs); module_exit(exit_romfs_fs); MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ |
| 172 133 1 60 96 1 2 15 6 77 1 3 77 1 2 1 4 4 56 73 32 68 7 19 52 8 45 34 4 30 13 25 1 22 32 1 18 13 14 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 | /* * linux/fs/hfs/dir.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains directory-related functions independent of which * scheme is being used to represent forks. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include "hfs_fs.h" #include "btree.h" /* * hfs_lookup() */ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { hfs_cat_rec rec; struct hfs_find_data fd; struct inode *inode = NULL; int res; res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); if (res) return ERR_PTR(res); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { if (res != -ENOENT) inode = ERR_PTR(res); } else { inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); if (!inode) inode = ERR_PTR(-EACCES); } hfs_find_exit(&fd); return d_splice_alias(inode, dentry); } /* * hfs_readdir */ static int hfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; union hfs_cat_rec entry; struct hfs_find_data fd; struct hfs_readdir_data *rd; u16 type; if (ctx->pos >= inode->i_size) return 0; err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (err) return err; hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) goto out; if (ctx->pos == 0) { /* This is completely artificial... */ if (!dir_emit_dot(file, ctx)) goto out; ctx->pos = 1; } if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } //if (fd.entrylength < HFS_MIN_THREAD_SZ) { // pr_err("truncated catalog thread\n"); // err = -EIO; // goto out; //} if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.ParID), DT_DIR)) goto out; ctx->pos = 2; } if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, ctx->pos - 1); if (err) goto out; for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { pr_err("walked past end of dir\n"); err = -EIO; goto out; } if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = entry.type; len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); if (type == HFS_CDR_DIR) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) { pr_err("small dir entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.dir.DirID), DT_DIR)) break; } else if (type == HFS_CDR_FIL) { if (fd.entrylength < sizeof(struct hfs_cat_file)) { pr_err("small file entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { pr_err("bad catalog entry type %d\n", type); err = -EIO; goto out; } ctx->pos++; if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } file->private_data = rd; rd->file = file; spin_lock(&HFS_I(inode)->open_dir_lock); list_add(&rd->list, &HFS_I(inode)->open_dir_list); spin_unlock(&HFS_I(inode)->open_dir_lock); } /* * Can be done after the list insertion; exclusion with * hfs_delete_cat() is provided by directory lock. */ memcpy(&rd->key, &fd.key->cat, sizeof(struct hfs_cat_key)); out: hfs_find_exit(&fd); return err; } static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { spin_lock(&HFS_I(inode)->open_dir_lock); list_del(&rd->list); spin_unlock(&HFS_I(inode)->open_dir_lock); kfree(rd); } return 0; } /* * hfs_create() * * This is the create() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new file in * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, mode); if (!inode) return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; } d_instantiate(dentry, inode); mark_inode_dirty(inode); return 0; } /* * hfs_mkdir() * * This is the mkdir() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new directory * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; } d_instantiate(dentry, inode); mark_inode_dirty(inode); return 0; } /* * hfs_remove() * * This serves as both unlink() and rmdir() in the inode_operations * structure for regular HFS directories. The purpose is to delete * an existing child, given the inode for the parent directory and * the name (and its length) of the existing directory. * * HFS does not have hardlinks, so both rmdir and unlink set the * link count to 0. The only difference is the emptiness check. */ static int hfs_remove(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int res; if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); if (res) return res; clear_nlink(inode); inode_set_ctime_current(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; } /* * hfs_rename() * * This is the rename() entry in the inode_operations structure for * regular HFS directories. The purpose is to rename an existing * file or directory, given the inode for the current directory and * the name (and its length) of the existing file/directory and the * inode for the new directory and the name (and its length) of the * new file/directory. * XXX: how do you handle must_be dir? */ static int hfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int res; if (flags & ~RENAME_NOREPLACE) return -EINVAL; /* Unlink destination if it already exists */ if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); if (res) return res; } res = hfs_cat_move(d_inode(old_dentry)->i_ino, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) hfs_cat_build_key(old_dir->i_sb, (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key, new_dir->i_ino, &new_dentry->d_name); return res; } const struct file_operations hfs_dir_operations = { .read = generic_read_dir, .iterate_shared = hfs_readdir, .llseek = generic_file_llseek, .release = hfs_dir_release, }; const struct inode_operations hfs_dir_inode_operations = { .create = hfs_create, .lookup = hfs_lookup, .unlink = hfs_remove, .mkdir = hfs_mkdir, .rmdir = hfs_remove, .rename = hfs_rename, .setattr = hfs_inode_setattr, }; |
| 64 274 30 69 262 8 14 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SEQ_FILE_H #define _LINUX_SEQ_FILE_H #include <linux/types.h> #include <linux/string.h> #include <linux/string_helpers.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/nodemask.h> #include <linux/fs.h> #include <linux/cred.h> struct seq_operations; struct seq_file { char *buf; size_t size; size_t from; size_t count; size_t pad_until; loff_t index; loff_t read_pos; struct mutex lock; const struct seq_operations *op; int poll_event; const struct file *file; void *private; }; struct seq_operations { void * (*start) (struct seq_file *m, loff_t *pos); void (*stop) (struct seq_file *m, void *v); void * (*next) (struct seq_file *m, void *v, loff_t *pos); int (*show) (struct seq_file *m, void *v); }; #define SEQ_SKIP 1 /** * seq_has_overflowed - check if the buffer has overflowed * @m: the seq_file handle * * seq_files have a buffer which may overflow. When this happens a larger * buffer is reallocated and all the data will be printed again. * The overflow state is true when m->count == m->size. * * Returns true if the buffer received more than it can hold. */ static inline bool seq_has_overflowed(struct seq_file *m) { return m->count == m->size; } /** * seq_get_buf - get buffer to write arbitrary data to * @m: the seq_file handle * @bufp: the beginning of the buffer is stored here * * Return the number of bytes available in the buffer, or zero if * there's no space. */ static inline size_t seq_get_buf(struct seq_file *m, char **bufp) { BUG_ON(m->count > m->size); if (m->count < m->size) *bufp = m->buf + m->count; else *bufp = NULL; return m->size - m->count; } /** * seq_commit - commit data to the buffer * @m: the seq_file handle * @num: the number of bytes to commit * * Commit @num bytes of data written to a buffer previously acquired * by seq_buf_get. To signal an error condition, or that the data * didn't fit in the available space, pass a negative @num value. */ static inline void seq_commit(struct seq_file *m, int num) { if (num < 0) { m->count = m->size; } else { BUG_ON(m->count + num > m->size); m->count += num; } } /** * seq_setwidth - set padding width * @m: the seq_file handle * @size: the max number of bytes to pad. * * Call seq_setwidth() for setting max width, then call seq_printf() etc. and * finally call seq_pad() to pad the remaining bytes. */ static inline void seq_setwidth(struct seq_file *m, size_t size) { m->pad_until = m->count + size; } void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); __printf(2, 0) void seq_vprintf(struct seq_file *m, const char *fmt, va_list args); __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void __seq_puts(struct seq_file *m, const char *s); static __always_inline void seq_puts(struct seq_file *m, const char *s) { if (!__builtin_constant_p(*s)) __seq_puts(m, s); else if (s[0] && !s[1]) seq_putc(m, s[0]); else seq_write(m, s, __builtin_strlen(s)); } void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width); void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc); static inline void seq_escape_str(struct seq_file *m, const char *src, unsigned int flags, const char *esc) { seq_escape_mem(m, src, strlen(src), flags, esc); } /** * seq_escape - print string into buffer, escaping some characters * @m: target buffer * @s: NULL-terminated string * @esc: set of characters that need escaping * * Puts string into buffer, replacing each occurrence of character from * @esc with usual octal escape. * * Use seq_has_overflowed() to check for errors. */ static inline void seq_escape(struct seq_file *m, const char *s, const char *esc) { seq_escape_str(m, s, ESCAPE_OCTAL, esc); } void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); int seq_path(struct seq_file *, const struct path *, const char *); int seq_file_path(struct seq_file *, struct file *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); void *single_start(struct seq_file *, loff_t *); int single_open(struct file *, int (*)(struct seq_file *, void *), void *); int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t); int single_release(struct inode *, struct file *); void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); #ifdef CONFIG_BINARY_PRINTF void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary); #endif #define DEFINE_SEQ_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ int ret = seq_open(file, &__name ## _sops); \ if (!ret && inode->i_private) { \ struct seq_file *seq_f = file->private_data; \ seq_f->private = inode->i_private; \ } \ return ret; \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = seq_release, \ } #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_SHOW_STORE_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .write = __name ## _write, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ .proc_open = __name ## _open, \ .proc_read = seq_read, \ .proc_lseek = seq_lseek, \ .proc_release = single_release, \ } static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS return seq->file->f_cred->user_ns; #else extern struct user_namespace init_user_ns; return &init_user_ns; #endif } /** * seq_show_options - display mount options with appropriate escapes. * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, can be NULL */ static inline void seq_show_option(struct seq_file *m, const char *name, const char *value) { seq_putc(m, ','); seq_escape(m, name, ",= \t\n\\"); if (value) { seq_putc(m, '='); seq_escape(m, value, ", \t\n\\"); } } /** * seq_show_option_n - display mount options with appropriate escapes * where @value must be a specific length (i.e. * not NUL-terminated). * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, cannot be NULL * @length: the exact length of @value to display, must be constant expression * * This is a macro since this uses "length" to define the size of the * stack buffer. */ #define seq_show_option_n(m, name, value, length) { \ char val_buf[length + 1]; \ memcpy(val_buf, value, length); \ val_buf[length] = '\0'; \ seq_show_option(m, name, val_buf); \ } #define SEQ_START_TOKEN ((void *)1) /* * Helpers for iteration over list_head-s in seq_files */ extern struct list_head *seq_list_start(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos); /* * Helpers for iteration over hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos); extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos); /* Helpers for iterating over per-cpu hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos); extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos); void seq_file_init(void); #endif |
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xdp #if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XDP_H #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/tracepoint.h> #include <linux/bpf.h> #include <net/xdp.h> #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ FN(TX) \ FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), TP_ARGS(dev, xdp, act), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), TP_printk("prog_id=%d action=%s ifindex=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); TRACE_EVENT(xdp_bulk_tx, TP_PROTO(const struct net_device *dev, int sent, int drops, int err), TP_ARGS(dev, sent, drops, err), TP_STRUCT__entry( __field(int, ifindex) __field(u32, act) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->ifindex = dev->ifindex; __entry->act = XDP_TX; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", __entry->ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #ifndef __DEVMAP_OBJ_TYPE #define __DEVMAP_OBJ_TYPE struct _bpf_dtab_netdev { struct net_device *dev; }; #endif /* __DEVMAP_OBJ_TYPE */ DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, to_ifindex) __field(u32, map_id) __field(int, map_index) ), TP_fast_assign( u32 ifindex = 0, map_index = index; if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Just leave to_ifindex to 0 if do broadcast redirect, * as tgt will be NULL. */ if (tgt) ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { ifindex = index; map_index = 0; } __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; __entry->to_ifindex = ifindex; __entry->map_id = map_id; __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats), TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) __field(unsigned int, xdp_redirect) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int to_cpu), TP_ARGS(map_id, processed, drops, to_cpu), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, to_cpu) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->to_cpu = to_cpu; ), TP_printk("enqueue" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " to_cpu=%d", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->to_cpu) ); TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err), TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( __field(int, from_ifindex) __field(u32, act) __field(int, to_ifindex) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ #include <net/xdp_priv.h> #define __MEM_TYPE_MAP(FN) \ FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); #define __MEM_TYPE_SYM_FN(x) \ { MEM_TYPE_##x, #x }, #define __MEM_TYPE_SYM_TAB \ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, TP_PROTO(const struct xdp_mem_allocator *xa), TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; ), TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator ) ); TRACE_EVENT(mem_connect, TP_PROTO(const struct xdp_mem_allocator *xa, const struct xdp_rxq_info *rxq), TP_ARGS(xa, rxq), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) __field(const struct xdp_rxq_info *, rxq) __field(int, ifindex) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; __entry->rxq = rxq; __entry->ifindex = rxq->dev->ifindex; ), TP_printk("mem_id=%d mem_type=%s allocator=%p" " ifindex=%d", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator, __entry->ifindex ) ); TRACE_EVENT(mem_return_failed, TP_PROTO(const struct xdp_mem_info *mem, const struct page *page), TP_ARGS(mem, page), TP_STRUCT__entry( __field(const struct page *, page) __field(u32, mem_id) __field(u32, mem_type) ), TP_fast_assign( __entry->page = page; __entry->mem_id = mem->id; __entry->mem_type = mem->type; ), TP_printk("mem_id=%d mem_type=%s page=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->page ) ); TRACE_EVENT(bpf_xdp_link_attach_failed, TP_PROTO(const char *msg), TP_ARGS(msg), TP_STRUCT__entry( __string(msg, msg) ), TP_fast_assign( __assign_str(msg); ), TP_printk("errmsg=%s", __get_str(msg)) ); #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h> |
| 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner * * This file contains the IRQ-resend code * * If the interrupt is waiting to be processed, we try to re-run it. * We can't directly run it from here since the caller might be in an * interrupt-protected region. Not all irq controller chips can * retrigger interrupts at the hardware level, so in those cases * we allow the resending of IRQs via a tasklet. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include "internals.h" #ifdef CONFIG_HARDIRQS_SW_RESEND /* hlist_head to handle software resend of interrupts: */ static HLIST_HEAD(irq_resend_list); static DEFINE_RAW_SPINLOCK(irq_resend_lock); /* * Run software resends of IRQ's */ static void resend_irqs(struct tasklet_struct *unused) { struct irq_desc *desc; raw_spin_lock_irq(&irq_resend_lock); while (!hlist_empty(&irq_resend_list)) { desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node); hlist_del_init(&desc->resend_node); raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); raw_spin_lock(&irq_resend_lock); } raw_spin_unlock_irq(&irq_resend_lock); } /* Tasklet to handle resend: */ static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { /* * Validate whether this interrupt can be safely injected from * non interrupt context */ if (handle_enforce_irqctx(&desc->irq_data)) return -EINVAL; /* * If the interrupt is running in the thread context of the parent * irq we need to be careful, because we cannot trigger it * directly. */ if (irq_settings_is_nested_thread(desc)) { /* * If the parent_irq is valid, we retrigger the parent, * otherwise we do nothing. */ if (!desc->parent_irq) return -EINVAL; desc = irq_to_desc(desc->parent_irq); if (!desc) return -EINVAL; } /* Add to resend_list and activate the softirq: */ raw_spin_lock(&irq_resend_lock); if (hlist_unhashed(&desc->resend_node)) hlist_add_head(&desc->resend_node, &irq_resend_list); raw_spin_unlock(&irq_resend_lock); tasklet_schedule(&resend_tasklet); return 0; } void clear_irq_resend(struct irq_desc *desc) { raw_spin_lock(&irq_resend_lock); hlist_del_init(&desc->resend_node); raw_spin_unlock(&irq_resend_lock); } void irq_resend_init(struct irq_desc *desc) { INIT_HLIST_NODE(&desc->resend_node); } #else void clear_irq_resend(struct irq_desc *desc) {} void irq_resend_init(struct irq_desc *desc) {} static int irq_sw_resend(struct irq_desc *desc) { return -EINVAL; } #endif static int try_retrigger(struct irq_desc *desc) { if (desc->irq_data.chip->irq_retrigger) return desc->irq_data.chip->irq_retrigger(&desc->irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY return irq_chip_retrigger_hierarchy(&desc->irq_data); #else return 0; #endif } /* * IRQ resend * * Is called with interrupts disabled and desc->lock held. */ int check_irq_resend(struct irq_desc *desc, bool inject) { int err = 0; /* * We do not resend level type interrupts. Level type interrupts * are resent by hardware when they are still active. Clear the * pending bit so suspend/resume does not get confused. */ if (irq_settings_is_level(desc)) { desc->istate &= ~IRQS_PENDING; return -EINVAL; } if (desc->istate & IRQS_REPLAY) return -EBUSY; if (!(desc->istate & IRQS_PENDING) && !inject) return 0; desc->istate &= ~IRQS_PENDING; if (!try_retrigger(desc)) err = irq_sw_resend(desc); /* If the retrigger was successful, mark it with the REPLAY bit */ if (!err) desc->istate |= IRQS_REPLAY; return err; } #ifdef CONFIG_GENERIC_IRQ_INJECTION /** * irq_inject_interrupt - Inject an interrupt for testing/error injection * @irq: The interrupt number * * This function must only be used for debug and testing purposes! * * Especially on x86 this can cause a premature completion of an interrupt * affinity change causing the interrupt line to become stale. Very * unlikely, but possible. * * The injection can fail for various reasons: * - Interrupt is not activated * - Interrupt is NMI type or currently replaying * - Interrupt is level type * - Interrupt does not support hardware retrigger and software resend is * either not enabled or not possible for the interrupt. */ int irq_inject_interrupt(unsigned int irq) { struct irq_desc *desc; unsigned long flags; int err; /* Try the state injection hardware interface first */ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) return 0; /* That failed, try via the resend mechanism */ desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return -EINVAL; /* * Only try to inject when the interrupt is: * - not NMI type * - activated */ if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data)) err = -EINVAL; else err = check_irq_resend(desc, true); irq_put_desc_busunlock(desc, flags); return err; } EXPORT_SYMBOL_GPL(irq_inject_interrupt); #endif |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PART_STAT_H #define _LINUX_PART_STAT_H #include <linux/blkdev.h> #include <asm/local.h> struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; unsigned long io_ticks; local_t in_flight[2]; }; /* * Macros to operate on percpu disk statistics: * * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should * be called between disk_stat_lock() and disk_stat_unlock(). * * part_stat_read() can be called at any time. */ #define part_stat_lock() preempt_disable() #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ (per_cpu_ptr((part)->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ typeof((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ res; \ }) static inline void part_stat_set_all(struct block_device *part, int value) { int i; for_each_possible_cpu(i) memset(per_cpu_ptr(part->bd_stats, i), value, sizeof(struct disk_stats)); } #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ __this_cpu_add((part)->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ if (bdev_is_partition(part)) \ __part_stat_add(bdev_whole(part), field, addnd); \ } while (0) #define part_stat_dec(part, field) \ part_stat_add(part, field, -1) #define part_stat_inc(part, field) \ part_stat_add(part, field, 1) #define part_stat_sub(part, field, subnd) \ part_stat_add(part, field, -subnd) #define part_stat_local_dec(part, field) \ local_dec(&(part_stat_get(part, field))) #define part_stat_local_inc(part, field) \ local_inc(&(part_stat_get(part, field))) #define part_stat_local_read(part, field) \ local_read(&(part_stat_get(part, field))) #define part_stat_local_read_cpu(part, field, cpu) \ local_read(&(part_stat_get_cpu(part, field, cpu))) #endif /* _LINUX_PART_STAT_H */ |
| 4 3 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | // SPDX-License-Identifier: GPL-2.0-only /* * CAIF USB handler * Copyright (C) ST-Ericsson AB 2011 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/etherdevice.h> #include <net/netns/generic.h> #include <net/caif/caif_dev.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol USB support"); MODULE_LICENSE("GPL"); #define CFUSB_PAD_DESCR_SZ 1 /* Alignment descriptor length */ #define CFUSB_ALIGNMENT 4 /* Number of bytes to align. */ #define CFUSB_MAX_HEADLEN (CFUSB_PAD_DESCR_SZ + CFUSB_ALIGNMENT-1) #define STE_USB_VID 0x04cc /* USB Product ID for ST-Ericsson */ #define STE_USB_PID_CAIF 0x230f /* Product id for CAIF Modems */ struct cfusbl { struct cflayer layer; u8 tx_eth_hdr[ETH_HLEN]; }; static bool pack_added; static int cfusbl_receive(struct cflayer *layr, struct cfpkt *pkt) { u8 hpad; /* Remove padding. */ cfpkt_extr_head(pkt, &hpad, 1); cfpkt_extr_head(pkt, NULL, hpad); return layr->up->receive(layr->up, pkt); } static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt) { struct caif_payload_info *info; u8 hpad; u8 zeros[CFUSB_ALIGNMENT]; struct sk_buff *skb; struct cfusbl *usbl = container_of(layr, struct cfusbl, layer); skb = cfpkt_tonative(pkt); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); info = cfpkt_info(pkt); hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1); if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) { pr_warn("Headroom too small\n"); kfree_skb(skb); return -EIO; } memset(zeros, 0, hpad); cfpkt_add_head(pkt, zeros, hpad); cfpkt_add_head(pkt, &hpad, 1); cfpkt_add_head(pkt, usbl->tx_eth_hdr, sizeof(usbl->tx_eth_hdr)); return layr->dn->transmit(layr->dn, pkt); } static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); } static struct cflayer *cfusbl_create(int phyid, const u8 ethaddr[ETH_ALEN], u8 braddr[ETH_ALEN]) { struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC); if (!this) return NULL; caif_assert(offsetof(struct cfusbl, layer) == 0); memset(&this->layer, 0, sizeof(this->layer)); this->layer.receive = cfusbl_receive; this->layer.transmit = cfusbl_transmit; this->layer.ctrlcmd = cfusbl_ctrlcmd; snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "usb%d", phyid); this->layer.id = phyid; /* * Construct TX ethernet header: * 0-5 destination address * 5-11 source address * 12-13 protocol type */ ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], braddr); ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], ethaddr); this->tx_eth_hdr[12] = cpu_to_be16(ETH_P_802_EX1) & 0xff; this->tx_eth_hdr[13] = (cpu_to_be16(ETH_P_802_EX1) >> 8) & 0xff; pr_debug("caif ethernet TX-header dst:%pM src:%pM type:%02x%02x\n", this->tx_eth_hdr, this->tx_eth_hdr + ETH_ALEN, this->tx_eth_hdr[12], this->tx_eth_hdr[13]); return (struct cflayer *) this; } static void cfusbl_release(struct cflayer *layer) { kfree(layer); } static struct packet_type caif_usb_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_EX1), }; static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_dev_common common; struct cflayer *layer, *link_support; struct usbnet *usbnet; struct usb_device *usbdev; int res; if (what == NETDEV_UNREGISTER && dev->reg_state >= NETREG_UNREGISTERED) return 0; /* Check whether we have a NCM device, and find its VID/PID. */ if (!(dev->dev.parent && dev->dev.parent->driver && strcmp(dev->dev.parent->driver->name, "cdc_ncm") == 0)) return 0; usbnet = netdev_priv(dev); usbdev = usbnet->udev; pr_debug("USB CDC NCM device VID:0x%4x PID:0x%4x\n", le16_to_cpu(usbdev->descriptor.idVendor), le16_to_cpu(usbdev->descriptor.idProduct)); /* Check for VID/PID that supports CAIF */ if (!(le16_to_cpu(usbdev->descriptor.idVendor) == STE_USB_VID && le16_to_cpu(usbdev->descriptor.idProduct) == STE_USB_PID_CAIF)) return 0; if (what == NETDEV_UNREGISTER) module_put(THIS_MODULE); if (what != NETDEV_REGISTER) return 0; __module_get(THIS_MODULE); memset(&common, 0, sizeof(common)); common.use_frag = false; common.use_fcs = false; common.use_stx = false; common.link_select = CAIF_LINK_HIGH_BANDW; common.flowctrl = NULL; link_support = cfusbl_create(dev->ifindex, dev->dev_addr, dev->broadcast); if (!link_support) return -ENOMEM; if (dev->num_tx_queues > 1) pr_warn("USB device uses more than one tx queue\n"); res = caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN, &layer, &caif_usb_type.func); if (res) goto err; if (!pack_added) dev_add_pack(&caif_usb_type); pack_added = true; strscpy(layer->name, dev->name, sizeof(layer->name)); return 0; err: cfusbl_release(link_support); return res; } static struct notifier_block caif_device_notifier = { .notifier_call = cfusbl_device_notify, .priority = 0, }; static int __init cfusbl_init(void) { return register_netdevice_notifier(&caif_device_notifier); } static void __exit cfusbl_exit(void) { unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_usb_type); } module_init(cfusbl_init); module_exit(cfusbl_exit); |
| 38 129 11 117 5 99 25 78 46 92 32 124 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | // SPDX-License-Identifier: GPL-2.0-only /* * ialloc.c * * PURPOSE * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998-2001 Ben Fennema * * HISTORY * * 02/24/99 blf Created. * */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/sched.h> #include <linux/slab.h> #include "udf_i.h" #include "udf_sb.h" void udf_free_inode(struct inode *inode) { udf_free_blocks(inode->i_sb, NULL, &UDF_I(inode)->i_location, 0, 1); } struct inode *udf_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; udf_pblk_t block; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); int err; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); iinfo = UDF_I(inode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { iinfo->i_efe = 1; if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); } else { iinfo->i_efe = 0; iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); } if (!iinfo->i_data) { make_bad_inode(inode); iput(inode); return ERR_PTR(-ENOMEM); } err = -ENOSPC; block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, start, &err); if (err) { make_bad_inode(inode); iput(inode); return ERR_PTR(err); } iinfo->i_unique = lvid_get_unique_id(sb); inode->i_generation = iinfo->i_unique; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) inode->i_gid = sbi->s_gid; iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = dinfo->i_location.partitionReferenceNum; inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0); inode->i_blocks = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; iinfo->i_use = 0; iinfo->i_checkpoint = 1; iinfo->i_extraPerms = FE_PERM_U_CHATTR; udf_update_extra_perms(inode, mode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; simple_inode_init_ts(inode); iinfo->i_crtime = inode_get_mtime(inode); if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); return ERR_PTR(-EIO); } mark_inode_dirty(inode); return inode; } |
| 169 29 145 366 366 120 120 7 7 126 126 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 | // SPDX-License-Identifier: GPL-2.0 /* * f2fs compress support * * Copyright (c) 2019 Chao Yu <chao@kernel.org> */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/moduleparam.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/lzo.h> #include <linux/lz4.h> #include <linux/zstd.h> #include <linux/pagevec.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include <trace/events/f2fs.h> static struct kmem_cache *cic_entry_slab; static struct kmem_cache *dic_entry_slab; static void *page_array_alloc(struct inode *inode, int nr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) return f2fs_kmem_cache_alloc(sbi->page_array_slab, GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); return f2fs_kzalloc(sbi, size, GFP_NOFS); } static void page_array_free(struct inode *inode, void *pages, int nr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (!pages) return; if (likely(size <= sbi->page_array_slab_size)) kmem_cache_free(sbi->page_array_slab, pages); else kfree(pages); } struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); int (*compress_pages)(struct compress_ctx *cc); int (*init_decompress_ctx)(struct decompress_io_ctx *dic); void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); int (*decompress_pages)(struct decompress_io_ctx *dic); bool (*is_level_valid)(int level); }; static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) { return index & (cc->cluster_size - 1); } static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index) { return index >> cc->log_cluster_size; } static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) { return cc->cluster_idx << cc->log_cluster_size; } bool f2fs_is_compressed_page(struct page *page) { if (!PagePrivate(page)) return false; if (!page_private(page)) return false; if (page_private_nonpointer(page)) return false; f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); return true; } static void f2fs_set_compressed_page(struct page *page, struct inode *inode, pgoff_t index, void *data) { struct folio *folio = page_folio(page); folio_attach_private(folio, (void *)data); /* i_crypto_info and iv index */ folio->index = index; folio->mapping = inode->i_mapping; } static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) { int i; for (i = 0; i < len; i++) { if (!cc->rpages[i]) continue; if (unlock) unlock_page(cc->rpages[i]); else put_page(cc->rpages[i]); } } static void f2fs_put_rpages(struct compress_ctx *cc) { f2fs_drop_rpages(cc, cc->cluster_size, false); } static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) { f2fs_drop_rpages(cc, len, true); } static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { unsigned int i; for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; if (redirty) redirty_page_for_writepage(wbc, cc->rpages[i]); f2fs_put_page(cc->rpages[i], unlock); } } struct page *f2fs_compress_control_page(struct page *page) { return ((struct compress_io_ctx *)page_private(page))->rpages[0]; } int f2fs_init_compress_ctx(struct compress_ctx *cc) { if (cc->rpages) return 0; cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); return cc->rpages ? 0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; cc->valid_nr_cpages = 0; if (!reuse) cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio) { unsigned int cluster_ofs; if (!f2fs_cluster_can_merge_page(cc, folio->index)) f2fs_bug_on(F2FS_I_SB(cc->inode), 1); cluster_ofs = offset_in_cluster(cc, folio->index); cc->rpages[cluster_ofs] = folio_page(folio, 0); cc->nr_rpages++; cc->cluster_idx = cluster_idx(cc, folio->index); } #ifdef CONFIG_F2FS_FS_LZO static int lzo_init_compress_ctx(struct compress_ctx *cc) { cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), LZO1X_MEM_COMPRESS, GFP_NOFS); if (!cc->private) return -ENOMEM; cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size); return 0; } static void lzo_destroy_compress_ctx(struct compress_ctx *cc) { kvfree(cc->private); cc->private = NULL; } static int lzo_compress_pages(struct compress_ctx *cc) { int ret; ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, &cc->clen, cc->private); if (ret != LZO_E_OK) { f2fs_err_ratelimited(F2FS_I_SB(cc->inode), "lzo compress failed, ret:%d", ret); return -EIO; } return 0; } static int lzo_decompress_pages(struct decompress_io_ctx *dic) { int ret; ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, dic->rbuf, &dic->rlen); if (ret != LZO_E_OK) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "lzo decompress failed, ret:%d", ret); return -EIO; } if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "lzo invalid rlen:%zu, expected:%lu", dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; } static const struct f2fs_compress_ops f2fs_lzo_ops = { .init_compress_ctx = lzo_init_compress_ctx, .destroy_compress_ctx = lzo_destroy_compress_ctx, .compress_pages = lzo_compress_pages, .decompress_pages = lzo_decompress_pages, }; #endif #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { unsigned int size = LZ4_MEM_COMPRESS; #ifdef CONFIG_F2FS_FS_LZ4HC if (F2FS_I(cc->inode)->i_compress_level) size = LZ4HC_MEM_COMPRESS; #endif cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; /* * we do not change cc->clen to LZ4_compressBound(inputsize) to * adapt worst compress case, because lz4 compressor can handle * output budget properly. */ cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; return 0; } static void lz4_destroy_compress_ctx(struct compress_ctx *cc) { kvfree(cc->private); cc->private = NULL; } static int lz4_compress_pages(struct compress_ctx *cc) { int len = -EINVAL; unsigned char level = F2FS_I(cc->inode)->i_compress_level; if (!level) len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); #ifdef CONFIG_F2FS_FS_LZ4HC else len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, level, cc->private); #endif if (len < 0) return len; if (!len) return -EAGAIN; cc->clen = len; return 0; } static int lz4_decompress_pages(struct decompress_io_ctx *dic) { int ret; ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, dic->clen, dic->rlen); if (ret < 0) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "lz4 decompress failed, ret:%d", ret); return -EIO; } if (ret != PAGE_SIZE << dic->log_cluster_size) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "lz4 invalid ret:%d, expected:%lu", ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; } static bool lz4_is_level_valid(int lvl) { #ifdef CONFIG_F2FS_FS_LZ4HC return !lvl || (lvl >= LZ4HC_MIN_CLEVEL && lvl <= LZ4HC_MAX_CLEVEL); #else return lvl == 0; #endif } static const struct f2fs_compress_ops f2fs_lz4_ops = { .init_compress_ctx = lz4_init_compress_ctx, .destroy_compress_ctx = lz4_destroy_compress_ctx, .compress_pages = lz4_compress_pages, .decompress_pages = lz4_decompress_pages, .is_level_valid = lz4_is_level_valid, }; #endif #ifdef CONFIG_F2FS_FS_ZSTD static int zstd_init_compress_ctx(struct compress_ctx *cc) { zstd_parameters params; zstd_cstream *stream; void *workspace; unsigned int workspace_size; unsigned char level = F2FS_I(cc->inode)->i_compress_level; /* Need to remain this for backward compatibility */ if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(¶ms.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), workspace_size, GFP_NOFS); if (!workspace) return -ENOMEM; stream = zstd_init_cstream(¶ms, 0, workspace, workspace_size); if (!stream) { f2fs_err_ratelimited(F2FS_I_SB(cc->inode), "%s zstd_init_cstream failed", __func__); kvfree(workspace); return -EIO; } cc->private = workspace; cc->private2 = stream; cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; return 0; } static void zstd_destroy_compress_ctx(struct compress_ctx *cc) { kvfree(cc->private); cc->private = NULL; cc->private2 = NULL; } static int zstd_compress_pages(struct compress_ctx *cc) { zstd_cstream *stream = cc->private2; zstd_in_buffer inbuf; zstd_out_buffer outbuf; int src_size = cc->rlen; int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE; int ret; inbuf.pos = 0; inbuf.src = cc->rbuf; inbuf.size = src_size; outbuf.pos = 0; outbuf.dst = cc->cbuf->cdata; outbuf.size = dst_size; ret = zstd_compress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { f2fs_err_ratelimited(F2FS_I_SB(cc->inode), "%s zstd_compress_stream failed, ret: %d", __func__, zstd_get_error_code(ret)); return -EIO; } ret = zstd_end_stream(stream, &outbuf); if (zstd_is_error(ret)) { f2fs_err_ratelimited(F2FS_I_SB(cc->inode), "%s zstd_end_stream returned %d", __func__, zstd_get_error_code(ret)); return -EIO; } /* * there is compressed data remained in intermediate buffer due to * no more space in cbuf.cdata */ if (ret) return -EAGAIN; cc->clen = outbuf.pos; return 0; } static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) { zstd_dstream *stream; void *workspace; unsigned int workspace_size; unsigned int max_window_size = MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); workspace_size = zstd_dstream_workspace_bound(max_window_size); workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), workspace_size, GFP_NOFS); if (!workspace) return -ENOMEM; stream = zstd_init_dstream(max_window_size, workspace, workspace_size); if (!stream) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "%s zstd_init_dstream failed", __func__); kvfree(workspace); return -EIO; } dic->private = workspace; dic->private2 = stream; return 0; } static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) { kvfree(dic->private); dic->private = NULL; dic->private2 = NULL; } static int zstd_decompress_pages(struct decompress_io_ctx *dic) { zstd_dstream *stream = dic->private2; zstd_in_buffer inbuf; zstd_out_buffer outbuf; int ret; inbuf.pos = 0; inbuf.src = dic->cbuf->cdata; inbuf.size = dic->clen; outbuf.pos = 0; outbuf.dst = dic->rbuf; outbuf.size = dic->rlen; ret = zstd_decompress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "%s zstd_decompress_stream failed, ret: %d", __func__, zstd_get_error_code(ret)); return -EIO; } if (dic->rlen != outbuf.pos) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "%s ZSTD invalid rlen:%zu, expected:%lu", __func__, dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; } static bool zstd_is_level_valid(int lvl) { return lvl >= zstd_min_clevel() && lvl <= zstd_max_clevel(); } static const struct f2fs_compress_ops f2fs_zstd_ops = { .init_compress_ctx = zstd_init_compress_ctx, .destroy_compress_ctx = zstd_destroy_compress_ctx, .compress_pages = zstd_compress_pages, .init_decompress_ctx = zstd_init_decompress_ctx, .destroy_decompress_ctx = zstd_destroy_decompress_ctx, .decompress_pages = zstd_decompress_pages, .is_level_valid = zstd_is_level_valid, }; #endif #ifdef CONFIG_F2FS_FS_LZO #ifdef CONFIG_F2FS_FS_LZORLE static int lzorle_compress_pages(struct compress_ctx *cc) { int ret; ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, &cc->clen, cc->private); if (ret != LZO_E_OK) { f2fs_err_ratelimited(F2FS_I_SB(cc->inode), "lzo-rle compress failed, ret:%d", ret); return -EIO; } return 0; } static const struct f2fs_compress_ops f2fs_lzorle_ops = { .init_compress_ctx = lzo_init_compress_ctx, .destroy_compress_ctx = lzo_destroy_compress_ctx, .compress_pages = lzorle_compress_pages, .decompress_pages = lzo_decompress_pages, }; #endif #endif static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #ifdef CONFIG_F2FS_FS_LZO &f2fs_lzo_ops, #else NULL, #endif #ifdef CONFIG_F2FS_FS_LZ4 &f2fs_lz4_ops, #else NULL, #endif #ifdef CONFIG_F2FS_FS_ZSTD &f2fs_zstd_ops, #else NULL, #endif #if defined(CONFIG_F2FS_FS_LZO) && defined(CONFIG_F2FS_FS_LZORLE) &f2fs_lzorle_ops, #else NULL, #endif }; bool f2fs_is_compress_backend_ready(struct inode *inode) { if (!f2fs_compressed_file(inode)) return true; return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; } bool f2fs_is_compress_level_valid(int alg, int lvl) { const struct f2fs_compress_ops *cops = f2fs_cops[alg]; if (cops->is_level_valid) return cops->is_level_valid(lvl); return lvl == 0; } static mempool_t *compress_page_pool; static int num_compress_pages = 512; module_param(num_compress_pages, uint, 0444); MODULE_PARM_DESC(num_compress_pages, "Number of intermediate compress pages to preallocate"); int __init f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); return compress_page_pool ? 0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) { mempool_destroy(compress_page_pool); } static struct page *f2fs_compress_alloc_page(void) { struct page *page; page = mempool_alloc(compress_page_pool, GFP_NOFS); lock_page(page); return page; } static void f2fs_compress_free_page(struct page *page) { if (!page) return; detach_page_private(page); page->mapping = NULL; unlock_page(page); mempool_free(page, compress_page_pool); } #define MAX_VMAP_RETRIES 3 static void *f2fs_vmap(struct page **pages, unsigned int count) { int i; void *buf = NULL; for (i = 0; i < MAX_VMAP_RETRIES; i++) { buf = vm_map_ram(pages, count, -1); if (buf) break; vm_unmap_aliases(); } return buf; } static int f2fs_compress_pages(struct compress_ctx *cc) { struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; u32 chksum = 0; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, cc->cluster_size, fi->i_compress_algorithm); if (cops->init_compress_ctx) { ret = cops->init_compress_ctx(cc); if (ret) goto out; } max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); cc->valid_nr_cpages = cc->nr_cpages; cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; } for (i = 0; i < cc->nr_cpages; i++) cc->cpages[i] = f2fs_compress_alloc_page(); cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { ret = -ENOMEM; goto out_free_cpages; } cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); if (!cc->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; } ret = cops->compress_pages(cc); if (ret) goto out_vunmap_cbuf; max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE; if (cc->clen > max_len) { ret = -EAGAIN; goto out_vunmap_cbuf; } cc->cbuf->clen = cpu_to_le32(cc->clen); if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) chksum = f2fs_crc32(F2FS_I_SB(cc->inode), cc->cbuf->cdata, cc->clen); cc->cbuf->chksum = cpu_to_le32(chksum); for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, (new_nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); vm_unmap_ram(cc->cbuf, cc->nr_cpages); vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = new_nr_cpages; i < cc->nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return 0; out_vunmap_cbuf: vm_unmap_ram(cc->cbuf, cc->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(cc->rbuf, cc->cluster_size); out_free_cpages: for (i = 0; i < cc->nr_cpages; i++) { if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); out: trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return ret; } static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool pre_alloc); static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, bool bypass_destroy_callback, bool pre_alloc); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; bool bypass_callback = false; int ret; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); if (dic->failed) { ret = -EIO; goto out_end_io; } ret = f2fs_prepare_decomp_mem(dic, false); if (ret) { bypass_callback = true; goto out_release; } dic->clen = le32_to_cpu(dic->cbuf->clen); dic->rlen = PAGE_SIZE << dic->log_cluster_size; if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; /* Avoid f2fs_commit_super in irq context */ if (!in_task) f2fs_handle_error_async(sbi, ERROR_FAIL_DECOMPRESSION); else f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } ret = cops->decompress_pages(dic); if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { u32 provided = le32_to_cpu(dic->cbuf->chksum); u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); if (provided != calculated) { if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); f2fs_info_ratelimited(sbi, "checksum invalid, nid = %lu, %x vs %x", dic->inode->i_ino, provided, calculated); } set_sbi_flag(sbi, SBI_NEED_FSCK); } } out_release: f2fs_release_decomp_mem(dic, bypass_callback, false); out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); f2fs_decompress_end_io(dic, ret, in_task); } /* * This is called when a page of a compressed cluster has been read from disk * (or failed to be read from disk). It checks whether this page was the last * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). */ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); dec_page_count(sbi, F2FS_RD_DATA); if (failed) WRITE_ONCE(dic->failed, true); else if (blkaddr && in_task) f2fs_cache_compressed_page(sbi, page, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) f2fs_decompress_cluster(dic, in_task); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) { if (cc->cluster_idx == NULL_CLUSTER) return true; return cc->cluster_idx == cluster_idx(cc, index); } bool f2fs_cluster_is_empty(struct compress_ctx *cc) { return cc->nr_rpages == 0; } static bool f2fs_cluster_is_full(struct compress_ctx *cc) { return cc->cluster_size == cc->nr_rpages; } bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) { if (f2fs_cluster_is_empty(cc)) return true; return is_page_in_cluster(cc, index); } bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, int index, int nr_pages, bool uptodate) { unsigned long pgidx = pages[index]->index; int i = uptodate ? 0 : 1; /* * when uptodate set to true, try to check all pages in cluster is * uptodate or not. */ if (uptodate && (pgidx % cc->cluster_size)) return false; if (nr_pages - index < cc->cluster_size) return false; for (; i < cc->cluster_size; i++) { if (pages[index + i]->index != pgidx + i) return false; if (uptodate && !PageUptodate(pages[index + i])) return false; } return true; } static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); int i; for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; f2fs_bug_on(F2FS_I_SB(cc->inode), !page); /* beyond EOF */ if (page_folio(page)->index >= nr_pages) return true; } return false; } bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { #ifdef CONFIG_F2FS_CHECK_FS struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; int cluster_end = 0; unsigned int count; int i; char *reason = ""; if (dn->data_blkaddr != COMPRESS_ADDR) return false; /* [..., COMPR_ADDR, ...] */ if (dn->ofs_in_node % cluster_size) { reason = "[*|C|*|*]"; goto out; } for (i = 1, count = 1; i < cluster_size; i++, count++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); /* [COMPR_ADDR, ..., COMPR_ADDR] */ if (blkaddr == COMPRESS_ADDR) { reason = "[C|*|C|*]"; goto out; } if (!__is_valid_data_blkaddr(blkaddr)) { if (!cluster_end) cluster_end = i; continue; } /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ if (cluster_end) { reason = "[C|N|N|V]"; goto out; } } f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size && !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED)); return false; out: f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); set_sbi_flag(sbi, SBI_NEED_FSCK); return true; #else return false; #endif } static int __f2fs_get_cluster_blocks(struct inode *inode, struct dnode_of_data *dn) { unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; int count, i; for (i = 0, count = 0; i < cluster_size; i++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); if (__is_valid_data_blkaddr(blkaddr)) count++; } return count; } static int __f2fs_cluster_blocks(struct inode *inode, unsigned int cluster_idx, enum cluster_check_type type) { struct dnode_of_data dn; unsigned int start_idx = cluster_idx << F2FS_I(inode)->i_log_cluster_size; int ret; set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) { if (ret == -ENOENT) ret = 0; goto fail; } if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; goto fail; } if (dn.data_blkaddr == COMPRESS_ADDR) { if (type == CLUSTER_COMPR_BLKS) ret = 1 + __f2fs_get_cluster_blocks(inode, &dn); else if (type == CLUSTER_IS_COMPR) ret = 1; } else if (type == CLUSTER_RAW_BLKS) { ret = __f2fs_get_cluster_blocks(inode, &dn); } fail: f2fs_put_dnode(&dn); return ret; } /* return # of compressed blocks in compressed cluster */ static int f2fs_compressed_blocks(struct compress_ctx *cc) { return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, CLUSTER_COMPR_BLKS); } /* return # of raw blocks in non-compressed cluster */ static int f2fs_decompressed_blocks(struct inode *inode, unsigned int cluster_idx) { return __f2fs_cluster_blocks(inode, cluster_idx, CLUSTER_RAW_BLKS); } /* return whether cluster is compressed one or not */ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { return __f2fs_cluster_blocks(inode, index >> F2FS_I(inode)->i_log_cluster_size, CLUSTER_IS_COMPR); } /* return whether cluster contains non raw blocks or not */ bool f2fs_is_sparse_cluster(struct inode *inode, pgoff_t index) { unsigned int cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size; return f2fs_decompressed_blocks(inode, cluster_idx) != F2FS_I(inode)->i_cluster_size; } static bool cluster_may_compress(struct compress_ctx *cc) { if (!f2fs_need_compress_data(cc->inode)) return false; if (f2fs_is_atomic_file(cc->inode)) return false; if (!f2fs_cluster_is_full(cc)) return false; if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) return false; return !cluster_has_invalid_data(cc); } static void set_cluster_writeback(struct compress_ctx *cc) { int i; for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) set_page_writeback(cc->rpages[i]); } } static void cancel_cluster_writeback(struct compress_ctx *cc, struct compress_io_ctx *cic, int submitted) { int i; /* Wait for submitted IOs. */ if (submitted > 1) { f2fs_submit_merged_write(F2FS_I_SB(cc->inode), DATA); while (atomic_read(&cic->pending_pages) != (cc->valid_nr_cpages - submitted + 1)) f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } /* Cancel writeback and stay locked. */ for (i = 0; i < cc->cluster_size; i++) { if (i < submitted) { inode_inc_dirty_pages(cc->inode); lock_page(cc->rpages[i]); } clear_page_private_gcing(cc->rpages[i]); if (folio_test_writeback(page_folio(cc->rpages[i]))) end_page_writeback(cc->rpages[i]); } } static void set_cluster_dirty(struct compress_ctx *cc) { int i; for (i = 0; i < cc->cluster_size; i++) if (cc->rpages[i]) { set_page_dirty(cc->rpages[i]); set_page_private_gcing(cc->rpages[i]); } } static int prepare_compress_overwrite(struct compress_ctx *cc, struct page **pagep, pgoff_t index, void **fsdata) { struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct address_space *mapping = cc->inode->i_mapping; struct page *page; sector_t last_block_in_bio; fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; retry: ret = f2fs_is_compressed_cluster(cc->inode, start_idx); if (ret <= 0) return ret; ret = f2fs_init_compress_ctx(cc); if (ret) return ret; /* keep page reference to avoid page reclaim */ for (i = 0; i < cc->cluster_size; i++) { page = f2fs_pagecache_get_page(mapping, start_idx + i, fgp_flag, GFP_NOFS); if (!page) { ret = -ENOMEM; goto unlock_pages; } if (PageUptodate(page)) f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page_folio(page)); } if (!f2fs_cluster_is_empty(cc)) { struct bio *bio = NULL; ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, NULL, true); f2fs_put_rpages(cc); f2fs_destroy_compress_ctx(cc, true); if (ret) goto out; if (bio) f2fs_submit_read_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); if (!page) { /* page can be truncated */ goto release_and_retry; } f2fs_wait_on_page_writeback(page, DATA, true, true); f2fs_compress_ctx_add_page(cc, page_folio(page)); if (!PageUptodate(page)) { release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); f2fs_destroy_compress_ctx(cc, true); goto retry; } } if (likely(!ret)) { *fsdata = cc->rpages; *pagep = cc->rpages[offset_in_cluster(cc, index)]; return cc->cluster_size; } unlock_pages: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); f2fs_destroy_compress_ctx(cc, true); out: return ret; } int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata) { struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, .rpages = NULL, .nr_rpages = 0, }; return prepare_compress_overwrite(&cc, pagep, index, fsdata); } bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied) { struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .rpages = fsdata, }; bool first_index = (index == cc.rpages[0]->index); if (copied) set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); f2fs_destroy_compress_ctx(&cc, false); return first_index; } int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) { void *fsdata = NULL; struct page *pagep; int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << log_cluster_size; int err; err = f2fs_is_compressed_cluster(inode, start_idx); if (err < 0) return err; /* truncate normal cluster */ if (!err) return f2fs_do_truncate_blocks(inode, from, lock); /* truncate compressed cluster */ err = f2fs_prepare_compress_overwrite(inode, &pagep, start_idx, &fsdata); /* should not be a normal cluster */ f2fs_bug_on(F2FS_I_SB(inode), err == 0); if (err <= 0) return err; if (err > 0) { struct page **rpages = fsdata; int cluster_size = F2FS_I(inode)->i_cluster_size; int i; for (i = cluster_size - 1; i >= 0; i--) { loff_t start = rpages[i]->index << PAGE_SHIFT; if (from <= start) { zero_user_segment(rpages[i], 0, PAGE_SIZE); } else { zero_user_segment(rpages[i], from - start, PAGE_SIZE); break; } } f2fs_compress_write_end(inode, fsdata, start_idx, true); } return 0; } static int f2fs_write_compressed_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type) { struct inode *inode = cc->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_io_info fio = { .sbi = sbi, .ino = cc->inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NEW_ADDR, .page = NULL, .encrypted_page = NULL, .compressed_page = NULL, .io_type = io_type, .io_wbc = wbc, .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ? 1 : 0, }; struct dnode_of_data dn; struct node_info ni; struct compress_io_ctx *cic; pgoff_t start_idx = start_idx_of_cluster(cc); unsigned int last_index = cc->cluster_size - 1; loff_t psize; int i, err; bool quota_inode = IS_NOQUOTA(inode); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(cc->rpages[0]->mapping, -EIO); goto out_free; } if (quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ f2fs_down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { goto out_free; } set_new_dnode(&dn, cc->inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (err) goto out_unlock_op; for (i = 0; i < cc->cluster_size; i++) { if (data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) == NULL_ADDR) goto out_put_dnode; } psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; fio.version = ni.version; cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!cic) goto out_put_dnode; cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; atomic_set(&cic->pending_pages, cc->valid_nr_cpages); cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, fio.old_blkaddr); if (fio.encrypted) { fio.page = cc->rpages[i + 1]; err = f2fs_encrypt_one_page(&fio); if (err) goto out_destroy_crypt; cc->cpages[i] = fio.encrypted_page; } } set_cluster_writeback(cc); for (i = 0; i < cc->cluster_size; i++) cic->rpages[i] = cc->rpages[i]; for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { block_t blkaddr; blkaddr = f2fs_data_blkaddr(&dn); fio.page = cc->rpages[i]; fio.old_blkaddr = blkaddr; /* cluster header */ if (i == 0) { if (blkaddr == COMPRESS_ADDR) fio.compr_blocks++; if (__is_valid_data_blkaddr(blkaddr)) f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR); goto unlock_continue; } if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); } goto unlock_continue; } f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR); if (fio.encrypted) fio.encrypted_page = cc->cpages[i - 1]; else fio.compressed_page = cc->cpages[i - 1]; cc->cpages[i - 1] = NULL; fio.submitted = 0; f2fs_outplace_write_data(&dn, &fio); if (unlikely(!fio.submitted)) { cancel_cluster_writeback(cc, cic, i); /* To call fscrypt_finalize_bounce_page */ i = cc->valid_nr_cpages; *submitted = 0; goto out_destroy_crypt; } (*submitted)++; unlock_continue: inode_dec_dirty_pages(cc->inode); unlock_page(fio.page); } if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); f2fs_put_dnode(&dn); if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); spin_lock(&fi->i_size_lock); if (fi->last_disk_size < psize) fi->last_disk_size = psize; spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: page_array_free(cc->inode, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) { if (!cc->cpages[i]) continue; fscrypt_finalize_bounce_page(&cc->cpages[i]); } out_put_cic: kmem_cache_free(cic_entry_slab, cic); out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); out_free: for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; } void f2fs_compress_write_end_io(struct bio *bio, struct page *page) { struct f2fs_sb_info *sbi = bio->bi_private; struct compress_io_ctx *cic = (struct compress_io_ctx *)page_private(page); enum count_type type = WB_DATA_TYPE(page, f2fs_is_compressed_page(page)); int i; if (unlikely(bio->bi_status)) mapping_set_error(cic->inode->i_mapping, -EIO); f2fs_compress_free_page(page); dec_page_count(sbi, type); if (atomic_dec_return(&cic->pending_pages)) return; for (i = 0; i < cic->nr_rpages; i++) { WARN_ON(!cic->rpages[i]); clear_page_private_gcing(cic->rpages[i]); end_page_writeback(cic->rpages[i]); } page_array_free(cic->inode, cic->rpages, cic->nr_rpages); kmem_cache_free(cic_entry_slab, cic); } static int f2fs_write_raw_pages(struct compress_ctx *cc, int *submitted_p, struct writeback_control *wbc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); int submitted, compr_blocks, i; int ret = 0; compr_blocks = f2fs_compressed_blocks(cc); for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; redirty_page_for_writepage(wbc, cc->rpages[i]); unlock_page(cc->rpages[i]); } if (compr_blocks < 0) return compr_blocks; /* overwrite compressed cluster w/ normal cluster */ if (compr_blocks > 0) f2fs_lock_op(sbi); for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; retry_write: lock_page(cc->rpages[i]); if (cc->rpages[i]->mapping != mapping) { continue_unlock: unlock_page(cc->rpages[i]); continue; } if (!PageDirty(cc->rpages[i])) goto continue_unlock; if (folio_test_writeback(page_folio(cc->rpages[i]))) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true); } if (!clear_page_dirty_for_io(cc->rpages[i])) goto continue_unlock; ret = f2fs_write_single_data_page(page_folio(cc->rpages[i]), &submitted, NULL, NULL, wbc, io_type, compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); ret = 0; } else if (ret == -EAGAIN) { ret = 0; /* * for quota file, just redirty left pages to * avoid deadlock caused by cluster update race * from foreground operation. */ if (IS_NOQUOTA(cc->inode)) goto out; f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } goto out; } *submitted_p += submitted; } out: if (compr_blocks > 0) f2fs_unlock_op(sbi); f2fs_balance_fs(sbi, true); return ret; } int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type) { int err; *submitted = 0; if (cluster_may_compress(cc)) { err = f2fs_compress_pages(cc); if (err == -EAGAIN) { add_compr_block_stat(cc->inode, cc->cluster_size); goto write; } else if (err) { f2fs_put_rpages_wbc(cc, wbc, true, 1); goto destroy_out; } err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); } write: f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: f2fs_destroy_compress_ctx(cc, false); return err; } static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, bool pre_alloc) { return pre_alloc ^ f2fs_low_mem_mode(sbi); } static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool pre_alloc) { const struct f2fs_compress_ops *cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; int i; if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) return 0; dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) return -ENOMEM; for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) { dic->tpages[i] = dic->rpages[i]; continue; } dic->tpages[i] = f2fs_compress_alloc_page(); } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) return -ENOMEM; dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); if (!dic->cbuf) return -ENOMEM; if (cops->init_decompress_ctx) return cops->init_decompress_ctx(dic); return 0; } static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, bool bypass_destroy_callback, bool pre_alloc) { const struct f2fs_compress_ops *cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) return; if (!bypass_destroy_callback && cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); if (dic->cbuf) vm_unmap_ram(dic->cbuf, dic->nr_cpages); if (dic->rbuf) vm_unmap_ram(dic->rbuf, dic->cluster_size); } static void f2fs_free_dic(struct decompress_io_ctx *dic, bool bypass_destroy_callback); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); int i, ret; dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!dic) return ERR_PTR(-ENOMEM); dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!dic->rpages) { kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); } dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; refcount_set(&dic->refcnt, 1); dic->failed = false; dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); if (!dic->cpages) { ret = -ENOMEM; goto out_free; } for (i = 0; i < dic->nr_cpages; i++) { struct page *page; page = f2fs_compress_alloc_page(); f2fs_set_compressed_page(page, cc->inode, start_idx + i + 1, dic); dic->cpages[i] = page; } ret = f2fs_prepare_decomp_mem(dic, true); if (ret) goto out_free; return dic; out_free: f2fs_free_dic(dic, true); return ERR_PTR(ret); } static void f2fs_free_dic(struct decompress_io_ctx *dic, bool bypass_destroy_callback) { int i; f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); if (dic->tpages) { for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) continue; if (!dic->tpages[i]) continue; f2fs_compress_free_page(dic->tpages[i]); } page_array_free(dic->inode, dic->tpages, dic->cluster_size); } if (dic->cpages) { for (i = 0; i < dic->nr_cpages; i++) { if (!dic->cpages[i]) continue; f2fs_compress_free_page(dic->cpages[i]); } page_array_free(dic->inode, dic->cpages, dic->nr_cpages); } page_array_free(dic->inode, dic->rpages, dic->nr_rpages); kmem_cache_free(dic_entry_slab, dic); } static void f2fs_late_free_dic(struct work_struct *work) { struct decompress_io_ctx *dic = container_of(work, struct decompress_io_ctx, free_work); f2fs_free_dic(dic, false); } static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) { if (refcount_dec_and_test(&dic->refcnt)) { if (in_task) { f2fs_free_dic(dic, false); } else { INIT_WORK(&dic->free_work, f2fs_late_free_dic); queue_work(F2FS_I_SB(dic->inode)->post_read_wq, &dic->free_work); } } } static void f2fs_verify_cluster(struct work_struct *work) { struct decompress_io_ctx *dic = container_of(work, struct decompress_io_ctx, verity_work); int i; /* Verify, update, and unlock the decompressed pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; if (fsverity_verify_page(rpage)) SetPageUptodate(rpage); else ClearPageUptodate(rpage); unlock_page(rpage); } f2fs_put_dic(dic, true); } /* * This is called when a compressed cluster has been decompressed * (or failed to be read and/or decompressed). */ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task) { int i; if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done * on the decompression workqueue. This is because verifying * the data pages can involve reading metadata pages from the * file, and these metadata pages may be compressed. */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); return; } /* Update and unlock the cluster's pagecache pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; if (failed) ClearPageUptodate(rpage); else SetPageUptodate(rpage); unlock_page(rpage); } /* * Release the reference to the decompress_io_ctx that was being held * for I/O completion. */ f2fs_put_dic(dic, in_task); } /* * Put a reference to a compressed page's decompress_io_ctx. * * This is called when the page is no longer needed and can be freed. */ void f2fs_put_page_dic(struct page *page, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); f2fs_put_dic(dic, in_task); } /* * check whether cluster blocks are contiguous, and add extent cache entry * only if cluster blocks are logically and physically contiguous. */ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, unsigned int ofs_in_node) { bool compressed = data_blkaddr(dn->inode, dn->node_page, ofs_in_node) == COMPRESS_ADDR; int i = compressed ? 1 : 0; block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node + i); for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) break; if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) return 0; } return compressed ? i - 1 : i; } const struct address_space_operations f2fs_compress_aops = { .release_folio = f2fs_release_folio, .invalidate_folio = f2fs_invalidate_folio, .migrate_folio = filemap_migrate_folio, }; struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) { return sbi->compress_inode->i_mapping; } void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) { if (!sbi->compress_inode) return; invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); } void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, nid_t ino, block_t blkaddr) { struct page *cpage; int ret; if (!test_opt(sbi, COMPRESS_CACHE)) return; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) return; if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) return; cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); if (cpage) { f2fs_put_page(cpage, 0); return; } cpage = alloc_page(__GFP_NOWARN | __GFP_IO); if (!cpage) return; ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), blkaddr, GFP_NOFS); if (ret) { f2fs_put_page(cpage, 0); return; } set_page_private_data(cpage, ino); memcpy(page_address(cpage), page_address(page), PAGE_SIZE); SetPageUptodate(cpage); f2fs_put_page(cpage, 1); } bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, block_t blkaddr) { struct page *cpage; bool hitted = false; if (!test_opt(sbi, COMPRESS_CACHE)) return false; cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); if (cpage) { if (PageUptodate(cpage)) { atomic_inc(&sbi->compress_page_hit); memcpy(page_address(page), page_address(cpage), PAGE_SIZE); hitted = true; } f2fs_put_page(cpage, 1); } return hitted; } void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { struct address_space *mapping = COMPRESS_MAPPING(sbi); struct folio_batch fbatch; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); if (!mapping->nrpages) return; folio_batch_init(&fbatch); do { unsigned int nr, i; nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (!nr) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); continue; } if (ino != get_page_private_data(&folio->page)) { folio_unlock(folio); continue; } generic_error_remove_folio(mapping, folio); folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } while (index < end); } int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { struct inode *inode; if (!test_opt(sbi, COMPRESS_CACHE)) return 0; inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); sbi->compress_inode = inode; sbi->compress_percent = COMPRESS_PERCENT; sbi->compress_watermark = COMPRESS_WATERMARK; atomic_set(&sbi->compress_page_hit, 0); return 0; } void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { if (!sbi->compress_inode) return; iput(sbi->compress_inode); sbi->compress_inode = NULL; } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; char slab_name[35]; if (!f2fs_sb_has_compression(sbi)) return 0; sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); sbi->page_array_slab_size = sizeof(struct page *) << F2FS_OPTION(sbi).compress_log_size; sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); return sbi->page_array_slab ? 0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { kmem_cache_destroy(sbi->page_array_slab); } int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) goto free_cic; return 0; free_cic: kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { kmem_cache_destroy(dic_entry_slab); kmem_cache_destroy(cic_entry_slab); } |
| 87 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_FILESTREAM_H__ #define __XFS_FILESTREAM_H__ struct xfs_mount; struct xfs_inode; struct xfs_bmalloca; struct xfs_alloc_arg; int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); void xfs_filestream_deassociate(struct xfs_inode *ip); int xfs_filestream_select_ag(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, xfs_extlen_t *blen); static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { return xfs_has_filestreams(ip->i_mount) || (ip->i_diflags & XFS_DIFLAG_FILESTREAM); } #endif /* __XFS_FILESTREAM_H__ */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 | /* SPDX-License-Identifier: GPL-2.0 */ /* interrupt.h */ #ifndef _LINUX_INTERRUPT_H #define _LINUX_INTERRUPT_H #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cleanup.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/hrtimer.h> #include <linux/kref.h> #include <linux/cpumask_types.h> #include <linux/workqueue.h> #include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/sections.h> /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When * requesting an interrupt without specifying a IRQF_TRIGGER, the * setting should be assumed to be "as already configured", which * may be as per machine or firmware initialisation. */ #define IRQF_TRIGGER_NONE 0x00000000 #define IRQF_TRIGGER_RISING 0x00000001 #define IRQF_TRIGGER_FALLING 0x00000002 #define IRQF_TRIGGER_HIGH 0x00000004 #define IRQF_TRIGGER_LOW 0x00000008 #define IRQF_TRIGGER_MASK (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \ IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING) #define IRQF_TRIGGER_PROBE 0x00000010 /* * These flags used only by the kernel as part of the * irq handling routines. * * IRQF_SHARED - allow sharing the irq among several devices * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur * IRQF_TIMER - Flag to mark this interrupt as timer interrupt * IRQF_PERCPU - Interrupt is per cpu * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is * registered first in a shared interrupt is considered for * performance reasons) * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee * that this interrupt will wake the system from a suspended * state. See Documentation/power/suspend-and-interrupts.rst * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this * interrupt handler after suspending interrupts. For system * wakeup devices users need to implement wakeup detection in * their interrupt handlers. * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it. * Users will enable it explicitly by enable_irq() or enable_nmi() * later. * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers, * depends on IRQF_PERCPU. * IRQF_COND_ONESHOT - Agree to do IRQF_ONESHOT if already set for a shared * interrupt. */ #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 #define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 #define IRQF_COND_SUSPEND 0x00040000 #define IRQF_NO_AUTOEN 0x00080000 #define IRQF_NO_DEBUG 0x00100000 #define IRQF_COND_ONESHOT 0x00200000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) /* * These values can be returned by request_any_context_irq() and * describe the context the interrupt will be run in. * * IRQC_IS_HARDIRQ - interrupt runs in hardirq context * IRQC_IS_NESTED - interrupt runs in a nested threaded context */ enum { IRQC_IS_HARDIRQ = 0, IRQC_IS_NESTED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); /** * struct irqaction - per interrupt action descriptor * @handler: interrupt handler function * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) * @thread_fn: interrupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @secondary: pointer to secondary irqaction (force threading) * @thread_flags: flags related to @thread * @thread_mask: bitmask for keeping track of @thread activity * @dir: pointer to the proc/irq/NN/name entry */ struct irqaction { irq_handler_t handler; void *dev_id; void __percpu *percpu_dev_id; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; struct irqaction *secondary; unsigned int irq; unsigned int flags; unsigned long thread_flags; unsigned long thread_mask; const char *name; struct proc_dir_entry *dir; } ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); /* * If a (PCI) device interrupt is not connected we set dev->irq to * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we * can distingiush that case from other error returns. * * 0x80000000 is guaranteed to be outside the available range of interrupts * and easy to distinguish from other possible incorrect values. */ #define IRQ_NOTCONNECTED (1U << 31) extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long flags, const char *name, void *dev); /** * request_irq - Add a handler for an interrupt line * @irq: The interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts * If NULL, the default primary handler is installed * @flags: Handling flags * @name: Name of the device generating this interrupt * @dev: A cookie passed to the handler function * * This call allocates an interrupt and establishes a handler; see * the documentation for request_threaded_irq() for details. */ static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) { return request_threaded_irq(irq, handler, NULL, flags | IRQF_COND_ONESHOT, name, dev); } extern int __must_check request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id); extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev); static inline int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, devname, percpu_dev_id); } extern int __must_check request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *dev); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); extern const void *free_nmi(unsigned int irq, void *dev_id); extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id); struct device; extern int __must_check devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id); static inline int __must_check devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, devname, dev_id); } extern int __must_check devm_request_any_context_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); extern bool irq_percpu_is_enabled(unsigned int irq); extern void irq_wake_thread(unsigned int irq, void *dev_id); DEFINE_LOCK_GUARD_1(disable_irq, int, disable_irq(*_T->lock), enable_irq(*_T->lock)) extern void disable_nmi_nosync(unsigned int irq); extern void disable_percpu_nmi(unsigned int irq); extern void enable_nmi(unsigned int irq); extern void enable_percpu_nmi(unsigned int irq, unsigned int type); extern int prepare_percpu_nmi(unsigned int irq); extern void teardown_percpu_nmi(unsigned int irq); extern int irq_inject_interrupt(unsigned int irq); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); extern void rearm_wake_irq(unsigned int irq); /** * struct irq_affinity_notify - context for notification of IRQ affinity changes * @irq: Interrupt to which notification applies * @kref: Reference count, for internal use * @work: Work item, for internal use * @notify: Function to be called on change. This will be * called in process context. * @release: Function to be called on release. This will be * called in process context. Once registered, the * structure must only be freed when this function is * called or later. */ struct irq_affinity_notify { unsigned int irq; struct kref kref; struct work_struct work; void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); void (*release)(struct kref *ref); }; #define IRQ_AFFINITY_MAX_SETS 4 /** * struct irq_affinity - Description for automatic irq affinity assignments * @pre_vectors: Don't apply affinity to @pre_vectors at beginning of * the MSI(-X) vector space * @post_vectors: Don't apply affinity to @post_vectors at end of * the MSI(-X) vector space * @nr_sets: The number of interrupt sets for which affinity * spreading is required * @set_size: Array holding the size of each interrupt set * @calc_sets: Callback for calculating the number and size * of interrupt sets * @priv: Private data for usage by @calc_sets, usually a * pointer to driver/device specific data. */ struct irq_affinity { unsigned int pre_vectors; unsigned int post_vectors; unsigned int nr_sets; unsigned int set_size[IRQ_AFFINITY_MAX_SETS]; void (*calc_sets)(struct irq_affinity *, unsigned int nvecs); void *priv; }; /** * struct irq_affinity_desc - Interrupt affinity descriptor * @mask: cpumask to hold the affinity assignment * @is_managed: 1 if the interrupt is managed internally */ struct irq_affinity_desc { struct cpumask mask; unsigned int is_managed : 1; }; #if defined(CONFIG_SMP) extern cpumask_var_t irq_default_affinity; extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity); /** * irq_update_affinity_hint - Update the affinity hint * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint, but does not change the affinity of the interrupt. */ static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, false); } /** * irq_set_affinity_and_hint - Update the affinity hint and apply the provided * cpumask to the interrupt * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint and if @m is not NULL it applies it as the * affinity of that interrupt. */ static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, true); } /* * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint() * instead. */ static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return irq_set_affinity_and_hint(irq, m); } extern int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd); unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return 0; } static inline int irq_can_set_affinity(unsigned int irq) { return 0; } static inline int irq_select_affinity(unsigned int irq) { return 0; } static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { return -EINVAL; } static inline int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { return 0; } static inline struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd) { return NULL; } static inline unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd) { return maxvec; } #endif /* CONFIG_SMP */ /* * Special lockdep variants of irq disabling/enabling. * These should be used for locking constructs that * know that a particular irq context which is disabled, * and which is the only irq-context user of a lock, * that it's safe to take the lock in the irq-disabled * section without disabling hardirqs. * * On !CONFIG_LOCKDEP they are equivalent to the normal * irq disable/enable methods. */ static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); #ifdef CONFIG_LOCKDEP local_irq_disable(); #endif } static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); #ifdef CONFIG_LOCKDEP local_irq_save(*flags); #endif } static inline void disable_irq_lockdep(unsigned int irq) { disable_irq(irq); #ifdef CONFIG_LOCKDEP local_irq_disable(); #endif } static inline void enable_irq_lockdep(unsigned int irq) { #ifdef CONFIG_LOCKDEP local_irq_enable(); #endif enable_irq(irq); } static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { #ifdef CONFIG_LOCKDEP local_irq_restore(*flags); #endif enable_irq(irq); } /* IRQ wakeup (PM) control: */ extern int irq_set_irq_wake(unsigned int irq, unsigned int on); static inline int enable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 1); } static inline int disable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 0); } /* * irq_get_irqchip_state/irq_set_irqchip_state specific flags */ enum irqchip_irq_state { IRQCHIP_STATE_PENDING, /* Is interrupt pending? */ IRQCHIP_STATE_ACTIVE, /* Is interrupt in progress? */ IRQCHIP_STATE_MASKED, /* Is interrupt masked? */ IRQCHIP_STATE_LINE_LEVEL, /* Is IRQ line high? */ }; extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state); extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool state); #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) # endif #else #define force_irqthreads() (false) #endif #ifndef local_softirq_pending #ifndef local_softirq_pending_ref #define local_softirq_pending_ref irq_stat.__softirq_pending #endif #define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref)) #define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x))) #define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x))) #endif /* local_softirq_pending */ /* Some architectures might implement lazy enabling/disabling of * interrupts. In some cases, such as stop_machine, we might want * to ensure that after a local_irq_disable(), interrupts have * really been disabled in hardware. Such architectures need to * implement the following hook. */ #ifndef hard_irq_disable #define hard_irq_disable() do { } while(0) #endif /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high frequency threaded job scheduling. For almost all the purposes tasklets are more than enough. F.e. all serial device BHs et al. should be converted to tasklets, not to softirqs. */ enum { HI_SOFTIRQ=0, TIMER_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; /* * The following vectors can be safely ignored after ksoftirqd is parked: * * _ RCU: * 1) rcutree_migrate_callbacks() migrates the queue. * 2) rcutree_report_cpu_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue * * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue */ #define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\ BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ)) /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. */ extern const char * const softirq_to_name[NR_SOFTIRQS]; /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. KAO */ struct softirq_action { void (*action)(void); }; asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); #ifdef CONFIG_PREEMPT_RT extern void do_softirq_post_smp_call_flush(unsigned int was_pending); #else static inline void do_softirq_post_smp_call_flush(unsigned int unused) { do_softirq(); } #endif extern void open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) { return this_cpu_read(ksoftirqd); } /* Tasklets --- multithreaded analogue of BHs. This API is deprecated. Please consider using threaded IRQs instead: https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de Main feature differing them of generic softirqs: tasklet is running only on one CPU simultaneously. Main feature differing them of BHs: different tasklets may be run simultaneously on different CPUs. Properties: * If tasklet_schedule() is called, then tasklet is guaranteed to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. * If this tasklet is already running on another CPU (or schedule is called from tasklet itself), it is rescheduled for later. * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. */ struct tasklet_struct { struct tasklet_struct *next; unsigned long state; atomic_t count; bool use_callback; union { void (*func)(unsigned long data); void (*callback)(struct tasklet_struct *t); }; unsigned long data; }; #define DECLARE_TASKLET(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .callback = _callback, \ .use_callback = true, \ } #define DECLARE_TASKLET_DISABLED(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .callback = _callback, \ .use_callback = true, \ } #define from_tasklet(var, callback_tasklet, tasklet_fieldname) \ container_of(callback_tasklet, typeof(*var), tasklet_fieldname) #define DECLARE_TASKLET_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .func = _func, \ } #define DECLARE_TASKLET_DISABLED_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .func = _func, \ } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ }; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } void tasklet_unlock(struct tasklet_struct *t); void tasklet_unlock_wait(struct tasklet_struct *t); void tasklet_unlock_spin_wait(struct tasklet_struct *t); #else static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } static inline void tasklet_unlock(struct tasklet_struct *t) { } static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } #endif extern void __tasklet_schedule(struct tasklet_struct *t); static inline void tasklet_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_schedule(t); } extern void __tasklet_hi_schedule(struct tasklet_struct *t); static inline void tasklet_hi_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_hi_schedule(t); } static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); smp_mb__after_atomic(); } /* * Do not use in new code. Disabling tasklets from atomic contexts is * error prone and should be avoided. */ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_spin_wait(t); smp_mb(); } static inline void tasklet_disable(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_wait(t); smp_mb(); } static inline void tasklet_enable(struct tasklet_struct *t) { smp_mb__before_atomic(); atomic_dec(&t->count); } extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); extern void tasklet_setup(struct tasklet_struct *t, void (*callback)(struct tasklet_struct *)); /* * Autoprobing for irqs: * * probe_irq_on() and probe_irq_off() provide robust primitives * for accurate IRQ probing during kernel initialization. They are * reasonably simple to use, are not "fooled" by spurious interrupts, * and, unlike other attempts at IRQ probing, they do not get hung on * stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards). * * For reasonably foolproof probing, use them as follows: * * 1. clear and/or mask the device's internal interrupt. * 2. sti(); * 3. irqs = probe_irq_on(); // "take over" all unassigned idle IRQs * 4. enable the device and cause it to trigger an interrupt. * 5. wait for the device to interrupt, using non-intrusive polling or a delay. * 6. irq = probe_irq_off(irqs); // get IRQ number, 0=none, negative=multiple * 7. service the device to clear its pending interrupt. * 8. loop again if paranoia is required. * * probe_irq_on() returns a mask of allocated irq's. * * probe_irq_off() takes the mask as a parameter, * and returns the irq number which occurred, * or zero if none occurred, or a negative irq number * if more than one irq occurred. */ #if !defined(CONFIG_GENERIC_IRQ_PROBE) static inline unsigned long probe_irq_on(void) { return 0; } static inline int probe_irq_off(unsigned long val) { return 0; } static inline unsigned int probe_irq_mask(unsigned long val) { return 0; } #else extern unsigned long probe_irq_on(void); /* returns 0 on failure */ extern int probe_irq_off(unsigned long); /* returns 0 or negative on failure */ extern unsigned int probe_irq_mask(unsigned long); /* returns mask of ISA interrupts */ #endif #ifdef CONFIG_PROC_FS /* Initialize /proc/irq/ */ extern void init_irq_proc(void); #else static inline void init_irq_proc(void) { } #endif #ifdef CONFIG_IRQ_TIMINGS void irq_timings_enable(void); void irq_timings_disable(void); u64 irq_timings_next_event(u64 now); #endif struct seq_file; int show_interrupts(struct seq_file *p, void *v); int arch_show_interrupts(struct seq_file *p, int prec); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); /* * We want to know which function is an entrypoint of a hardirq or a softirq. */ #ifndef __irq_entry # define __irq_entry __section(".irqentry.text") #endif #define __softirq_entry __section(".softirqentry.text") #endif |
| 2 44 8 46 84 84 56 83 1 70 2 2 44 44 71 70 70 2 53 3 2 4 10 41 63 68 42 28 6 15 19 35 36 28 21 24 39 32 2 3 36 6 2 24 24 24 37 40 40 39 4 31 7 8 36 3 26 23 18 26 25 17 21 19 9 17 7 21 4 28 15 9 5 15 44 1 43 43 42 40 42 40 37 4 17 14 28 37 28 14 42 36 9 45 45 44 56 58 60 5 30 4 65 60 28 6 24 40 20 21 39 66 1 5 58 4 66 4 53 54 53 1 2 1 1 1 1 27 4 22 24 1 1 44 27 1 16 2 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 | // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; u64 slack = current->timer_slack_ns; if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < slack) return slack; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; /* * Perform the default wake up operation using a dummy * waitqueue. * * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!pwq->triggered) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return -EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; mask = EPOLLNVAL; f = fdget(i); if (fd_file(f)) { wait_key_set(wait, in, out, bit, busy_flag); mask = vfs_poll(fd_file(f), wait); fdput(f); } if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (can_do_masked_user_access()) from = masked_user_access_begin(from); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask = 0, filter; struct fd f; if (fd < 0) goto out; mask = EPOLLNVAL; f = fdget(fd); if (!fd_file(f)) goto out; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(fd_file(f), pwait); if (mask & busy_flag) *can_busy_poll = true; mask &= filter; /* Mask out unneeded events. */ fdput(f); out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); return mask; } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = ¤t->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif |
| 3 3 3 4 3 4 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 | // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int ret = 0; if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) goto out; if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ops->fib_rules_seq++; return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ASSERT_RTNL(); ops = lookup_rules_ops(net, family); if (!ops) return 0; fib_rules_seq = ops->fib_rules_seq; rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } else { nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = tb[FRA_PROTOCOL] ? nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; if (tb[FRA_IIFNAME]) { struct net_device *dev; nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ if (nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_PRIORITY] = { .type = NLA_U32 }, [FRA_FWMARK] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, [FRA_TUN_ID] = { .type = NLA_U64 }, [FRA_FWMASK] = { .type = NLA_U32 }, [FRA_TABLE] = { .type = NLA_U32 }, [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, [FRA_GOTO] = { .type = NLA_U32 }, [FRA_L3MDEV] = { .type = NLA_U8 }, [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_newrule); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; int err = -EINVAL; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout: kfree(nlrule); rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (rule->iifindex == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (rule->oifindex == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int err, idx = 0, family; if (cb->strict_check) { err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; return dump_rules(skb, cb, ops); } err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; err = dump_rules(skb, cb, ops); if (err < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return err; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) rule->oifindex = dev->ifindex; } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; if (rule->oifindex == dev->ifindex) rule->oifindex = -1; } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static int __init fib_rules_init(void) { int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, RTNL_FLAG_DUMP_UNLOCKED); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); rtnl_unregister(PF_UNSPEC, RTM_DELRULE); rtnl_unregister(PF_UNSPEC, RTM_GETRULE); return err; } subsys_initcall(fib_rules_init); |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef DRIVERS_PCI_H #define DRIVERS_PCI_H #include <linux/pci.h> /* Number of possible devfns: 0.0 to 1f.7 inclusive */ #define MAX_NR_DEVFNS 256 #define PCI_FIND_CAP_TTL 48 #define PCI_VSEC_ID_INTEL_TBT 0x1234 /* Thunderbolt */ #define PCIE_LINK_RETRAIN_TIMEOUT_MS 1000 /* * Power stable to PERST# inactive. * * See the "Power Sequencing and Reset Signal Timings" table of the PCI Express * Card Electromechanical Specification, Revision 5.1, Section 2.9.2, Symbol * "T_PVPERL". */ #define PCIE_T_PVPERL_MS 100 /* * REFCLK stable before PERST# inactive. * * See the "Power Sequencing and Reset Signal Timings" table of the PCI Express * Card Electromechanical Specification, Revision 5.1, Section 2.9.2, Symbol * "T_PERST-CLK". */ #define PCIE_T_PERST_CLK_US 100 /* * End of conventional reset (PERST# de-asserted) to first configuration * request (device able to respond with a "Request Retry Status" completion), * from PCIe r6.0, sec 6.6.1. */ #define PCIE_T_RRS_READY_MS 100 /* * PCIe r6.0, sec 5.3.3.2.1 <PME Synchronization> * Recommends 1ms to 10ms timeout to check L2 ready. */ #define PCIE_PME_TO_L2_TIMEOUT_US 10000 /* * PCIe r6.0, sec 6.6.1 <Conventional Reset> * * - "With a Downstream Port that does not support Link speeds greater * than 5.0 GT/s, software must wait a minimum of 100 ms following exit * from a Conventional Reset before sending a Configuration Request to * the device immediately below that Port." * * - "With a Downstream Port that supports Link speeds greater than * 5.0 GT/s, software must wait a minimum of 100 ms after Link training * completes before sending a Configuration Request to the device * immediately below that Port." */ #define PCIE_RESET_CONFIG_DEVICE_WAIT_MS 100 /* Message Routing (r[2:0]); PCIe r6.0, sec 2.2.8 */ #define PCIE_MSG_TYPE_R_RC 0 #define PCIE_MSG_TYPE_R_ADDR 1 #define PCIE_MSG_TYPE_R_ID 2 #define PCIE_MSG_TYPE_R_BC 3 #define PCIE_MSG_TYPE_R_LOCAL 4 #define PCIE_MSG_TYPE_R_GATHER 5 /* Power Management Messages; PCIe r6.0, sec 2.2.8.2 */ #define PCIE_MSG_CODE_PME_TURN_OFF 0x19 /* INTx Mechanism Messages; PCIe r6.0, sec 2.2.8.1 */ #define PCIE_MSG_CODE_ASSERT_INTA 0x20 #define PCIE_MSG_CODE_ASSERT_INTB 0x21 #define PCIE_MSG_CODE_ASSERT_INTC 0x22 #define PCIE_MSG_CODE_ASSERT_INTD 0x23 #define PCIE_MSG_CODE_DEASSERT_INTA 0x24 #define PCIE_MSG_CODE_DEASSERT_INTB 0x25 #define PCIE_MSG_CODE_DEASSERT_INTC 0x26 #define PCIE_MSG_CODE_DEASSERT_INTD 0x27 extern const unsigned char pcie_link_speed[]; extern bool pci_early_dump; bool pcie_cap_has_lnkctl(const struct pci_dev *dev); bool pcie_cap_has_lnkctl2(const struct pci_dev *dev); bool pcie_cap_has_rtctl(const struct pci_dev *dev); /* Functions internal to the PCI core code */ #ifdef CONFIG_DMI extern const struct attribute_group pci_dev_smbios_attr_group; #endif enum pci_mmap_api { PCI_MMAP_SYSFS, /* mmap on /sys/bus/pci/devices/<BDF>/resource<N> */ PCI_MMAP_PROCFS /* mmap on /proc/bus/pci/<BDF> */ }; int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai, enum pci_mmap_api mmap_api); bool pci_reset_supported(struct pci_dev *dev); void pci_init_reset_methods(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); struct pci_cap_saved_data { u16 cap_nr; bool cap_extended; unsigned int size; u32 data[]; }; struct pci_cap_saved_state { struct hlist_node next; struct pci_cap_saved_data cap; }; void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size); struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, u16 cap); #define PCI_PM_D2_DELAY 200 /* usec; see PCIe r4.0, sec 5.9.1 */ #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ void pci_update_current_state(struct pci_dev *dev, pci_power_t state); void pci_refresh_power_state(struct pci_dev *dev); int pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_device_status(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_pme_restore(struct pci_dev *dev); bool pci_dev_need_resume(struct pci_dev *dev); void pci_dev_adjust_pme(struct pci_dev *dev); void pci_dev_complete_resume(struct pci_dev *pci_dev); void pci_config_pm_runtime_get(struct pci_dev *dev); void pci_config_pm_runtime_put(struct pci_dev *dev); void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); void pci_msi_init(struct pci_dev *dev); void pci_msix_init(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type); static inline bool pci_bus_rrs_vendor_id(u32 l) { return (l & 0xffff) == PCI_VENDOR_ID_PCI_SIG; } static inline void pci_wakeup_event(struct pci_dev *dev) { /* Wait 100 ms before the system can be put into a sleep state. */ pm_wakeup_event(&dev->dev, 100); } static inline bool pci_has_subordinate(struct pci_dev *pci_dev) { return !!(pci_dev->subordinate); } static inline bool pci_power_manageable(struct pci_dev *pci_dev) { /* * Currently we allow normal PCI devices and PCI bridges transition * into D3 if their bridge_d3 is set. */ return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; } static inline bool pcie_downstream_port(const struct pci_dev *dev) { int type = pci_pcie_type(dev); return type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_DOWNSTREAM || type == PCI_EXP_TYPE_PCIE_BRIDGE; } void pci_vpd_init(struct pci_dev *dev); extern const struct attribute_group pci_dev_vpd_attr_group; /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); void pci_restore_vc_state(struct pci_dev *dev); void pci_allocate_vc_save_buffers(struct pci_dev *dev); /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); int pci_proc_detach_device(struct pci_dev *dev); int pci_proc_detach_bus(struct pci_bus *bus); #else static inline int pci_proc_attach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_bus(struct pci_bus *bus) { return 0; } #endif /* Functions for PCI Hotplug drivers to use */ int pci_hp_add_bridge(struct pci_dev *dev); #if defined(CONFIG_SYSFS) && defined(HAVE_PCI_LEGACY) void pci_create_legacy_files(struct pci_bus *bus); void pci_remove_legacy_files(struct pci_bus *bus); #else static inline void pci_create_legacy_files(struct pci_bus *bus) { } static inline void pci_remove_legacy_files(struct pci_bus *bus) { } #endif /* Lock for read/write access to pci device and bus lists */ extern struct rw_semaphore pci_bus_sem; extern struct mutex pci_slot_mutex; extern raw_spinlock_t pci_lock; extern unsigned int pci_pm_d3hot_delay; #ifdef CONFIG_PCI_MSI void pci_no_msi(void); #else static inline void pci_no_msi(void) { } #endif void pci_realloc_get_opt(char *); static inline int pci_no_d1d2(struct pci_dev *dev) { unsigned int parent_dstates = 0; if (dev->bus->self) parent_dstates = dev->bus->self->no_d1d2; return (dev->no_d1d2 || parent_dstates); } #ifdef CONFIG_SYSFS int pci_create_sysfs_dev_files(struct pci_dev *pdev); void pci_remove_sysfs_dev_files(struct pci_dev *pdev); extern const struct attribute_group *pci_dev_groups[]; extern const struct attribute_group *pci_dev_attr_groups[]; extern const struct attribute_group *pcibus_groups[]; extern const struct attribute_group *pci_bus_groups[]; #else static inline int pci_create_sysfs_dev_files(struct pci_dev *pdev) { return 0; } static inline void pci_remove_sysfs_dev_files(struct pci_dev *pdev) { } #define pci_dev_groups NULL #define pci_dev_attr_groups NULL #define pcibus_groups NULL #define pci_bus_groups NULL #endif extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mmio_size; extern unsigned long pci_hotplug_mmio_pref_size; extern unsigned long pci_hotplug_bus_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching * PCI device id structure * @id: single PCI device id structure to match * @dev: the PCI device structure to match against * * Returns the matching pci_device_id structure or %NULL if there is no match. */ static inline const struct pci_device_id * pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) { if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && (id->device == PCI_ANY_ID || id->device == dev->device) && (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) && (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) && !((id->class ^ dev->class) & id->class_mask)) return id; return NULL; } /* PCI slot sysfs helper code */ #define to_pci_slot(s) container_of(s, struct pci_slot, kobj) extern struct kset *pci_slots_kset; struct pci_slot_attribute { struct attribute attr; ssize_t (*show)(struct pci_slot *, char *); ssize_t (*store)(struct pci_slot *, const char *, size_t); }; #define to_pci_slot_attr(s) container_of(s, struct pci_slot_attribute, attr) enum pci_bar_type { pci_bar_unknown, /* Standard PCI BAR probe */ pci_bar_io, /* An I/O port BAR */ pci_bar_mem32, /* A 32-bit memory BAR */ pci_bar_mem64, /* A 64-bit memory BAR */ }; struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); int pci_setup_device(struct pci_dev *dev); int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int reg); void pci_configure_ari(struct pci_dev *dev); void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head); void __pci_bus_assign_resources(const struct pci_bus *bus, struct list_head *realloc_head, struct list_head *fail_head); bool pci_bus_clip_resource(struct pci_dev *dev, int idx); const char *pci_resource_name(struct pci_dev *dev, unsigned int i); void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); struct pci_bus *pci_bus_get(struct pci_bus *bus); void pci_bus_put(struct pci_bus *bus); /* PCIe link information from Link Capabilities 2 */ #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \ ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN) /* PCIe speed to Mb/s reduced by encoding overhead */ #define PCIE_SPEED2MBS_ENC(speed) \ ((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \ (speed) == PCIE_SPEED_32_0GT ? 32000*128/130 : \ (speed) == PCIE_SPEED_16_0GT ? 16000*128/130 : \ (speed) == PCIE_SPEED_8_0GT ? 8000*128/130 : \ (speed) == PCIE_SPEED_5_0GT ? 5000*8/10 : \ (speed) == PCIE_SPEED_2_5GT ? 2500*8/10 : \ 0) static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed) { switch (speed) { case PCIE_SPEED_2_5GT: return 2500; case PCIE_SPEED_5_0GT: return 5000; case PCIE_SPEED_8_0GT: return 8000; case PCIE_SPEED_16_0GT: return 16000; case PCIE_SPEED_32_0GT: return 32000; case PCIE_SPEED_64_0GT: return 64000; default: break; } return -EINVAL; } const char *pci_speed_string(enum pci_bus_speed speed); enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev); enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void pcie_report_downtraining(struct pci_dev *dev); void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); /* Single Root I/O Virtualization */ struct pci_sriov { int pos; /* Capability position */ int nres; /* Number of resources */ u32 cap; /* SR-IOV Capabilities */ u16 ctrl; /* SR-IOV Control */ u16 total_VFs; /* Total VFs associated with the PF */ u16 initial_VFs; /* Initial VFs associated with the PF */ u16 num_VFs; /* Number of VFs available */ u16 offset; /* First VF Routing ID offset */ u16 stride; /* Following VF stride */ u16 vf_device; /* VF device ID */ u32 pgsz; /* Page size for BAR alignment */ u8 link; /* Function Dependency Link */ u8 max_VF_buses; /* Max buses consumed by VFs */ u16 driver_max_VFs; /* Max num VFs driver supports */ struct pci_dev *dev; /* Lowest numbered PF */ struct pci_dev *self; /* This PF */ u32 class; /* VF device */ u8 hdr_type; /* VF header type */ u16 subsystem_vendor; /* VF subsystem vendor */ u16 subsystem_device; /* VF subsystem device */ resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ bool drivers_autoprobe; /* Auto probing of VFs by driver */ }; #ifdef CONFIG_PCI_DOE void pci_doe_init(struct pci_dev *pdev); void pci_doe_destroy(struct pci_dev *pdev); void pci_doe_disconnected(struct pci_dev *pdev); #else static inline void pci_doe_init(struct pci_dev *pdev) { } static inline void pci_doe_destroy(struct pci_dev *pdev) { } static inline void pci_doe_disconnected(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_NPEM void pci_npem_create(struct pci_dev *dev); void pci_npem_remove(struct pci_dev *dev); #else static inline void pci_npem_create(struct pci_dev *dev) { } static inline void pci_npem_remove(struct pci_dev *dev) { } #endif /** * pci_dev_set_io_state - Set the new error state if possible. * * @dev: PCI device to set new error_state * @new: the state we want dev to be in * * If the device is experiencing perm_failure, it has to remain in that state. * Any other transition is allowed. * * Returns true if state has been changed to the requested state. */ static inline bool pci_dev_set_io_state(struct pci_dev *dev, pci_channel_state_t new) { pci_channel_state_t old; switch (new) { case pci_channel_io_perm_failure: xchg(&dev->error_state, pci_channel_io_perm_failure); return true; case pci_channel_io_frozen: old = cmpxchg(&dev->error_state, pci_channel_io_normal, pci_channel_io_frozen); return old != pci_channel_io_perm_failure; case pci_channel_io_normal: old = cmpxchg(&dev->error_state, pci_channel_io_frozen, pci_channel_io_normal); return old != pci_channel_io_perm_failure; default: return false; } } static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) { pci_dev_set_io_state(dev, pci_channel_io_perm_failure); pci_doe_disconnected(dev); return 0; } /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 #define PCI_DPC_RECOVERED 1 #define PCI_DPC_RECOVERING 2 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) { assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added); } static inline bool pci_dev_is_added(const struct pci_dev *dev) { return test_bit(PCI_DEV_ADDED, &dev->priv_flags); } #ifdef CONFIG_PCIEAER #include <linux/aer.h> #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; unsigned int id:16; unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ unsigned int __pad1:5; unsigned int multi_error_valid:1; unsigned int first_error:5; unsigned int __pad2:2; unsigned int tlp_header_valid:1; unsigned int status; /* COR/UNCOR Error Status */ unsigned int mask; /* COR/UNCOR Error Mask */ struct pcie_tlp_log tlp; /* TLP Header */ }; int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); #endif /* CONFIG_PCIEAER */ #ifdef CONFIG_PCIEPORTBUS /* Cached RCEC Endpoint Association */ struct rcec_ea { u8 nextbusn; u8 lastbusn; u32 bitmap; }; #endif #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); void pci_dpc_init(struct pci_dev *pdev); void dpc_process_error(struct pci_dev *pdev); pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); bool pci_dpc_recovered(struct pci_dev *pdev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) { } static inline void pci_restore_dpc_state(struct pci_dev *dev) { } static inline void pci_dpc_init(struct pci_dev *pdev) { } static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCIEPORTBUS void pci_rcec_init(struct pci_dev *dev); void pci_rcec_exit(struct pci_dev *dev); void pcie_link_rcec(struct pci_dev *rcec); void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata); #else static inline void pci_rcec_init(struct pci_dev *dev) { } static inline void pci_rcec_exit(struct pci_dev *dev) { } static inline void pcie_link_rcec(struct pci_dev *rcec) { } static inline void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata) { } #endif #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else static inline void pci_ats_init(struct pci_dev *d) { } static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_PRI void pci_pri_init(struct pci_dev *dev); void pci_restore_pri_state(struct pci_dev *pdev); #else static inline void pci_pri_init(struct pci_dev *dev) { } static inline void pci_restore_pri_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_PASID void pci_pasid_init(struct pci_dev *dev); void pci_restore_pasid_state(struct pci_dev *pdev); #else static inline void pci_pasid_init(struct pci_dev *dev) { } static inline void pci_restore_pasid_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_IOV int pci_iov_init(struct pci_dev *dev); void pci_iov_release(struct pci_dev *dev); void pci_iov_remove(struct pci_dev *dev); void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); extern const struct attribute_group sriov_pf_dev_attr_group; extern const struct attribute_group sriov_vf_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { return -ENODEV; } static inline void pci_iov_release(struct pci_dev *dev) { } static inline void pci_iov_remove(struct pci_dev *dev) { } static inline void pci_restore_iov_state(struct pci_dev *dev) { } static inline int pci_iov_bus_range(struct pci_bus *bus) { return 0; } #endif /* CONFIG_PCI_IOV */ #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); void pci_save_ptm_state(struct pci_dev *dev); void pci_restore_ptm_state(struct pci_dev *dev); void pci_suspend_ptm(struct pci_dev *dev); void pci_resume_ptm(struct pci_dev *dev); #else static inline void pci_ptm_init(struct pci_dev *dev) { } static inline void pci_save_ptm_state(struct pci_dev *dev) { } static inline void pci_restore_ptm_state(struct pci_dev *dev) { } static inline void pci_suspend_ptm(struct pci_dev *dev) { } static inline void pci_resume_ptm(struct pci_dev *dev) { } #endif unsigned long pci_cardbus_resource_alignment(struct resource *); static inline resource_size_t pci_resource_alignment(struct pci_dev *dev, struct resource *res) { #ifdef CONFIG_PCI_IOV int resno = res - dev->resource; if (resno >= PCI_IOV_RESOURCES && resno <= PCI_IOV_RESOURCE_END) return pci_sriov_resource_alignment(dev, resno); #endif if (dev->class >> 8 == PCI_CLASS_BRIDGE_CARDBUS) return pci_cardbus_resource_alignment(res); return resource_alignment(res); } void pci_acs_init(struct pci_dev *dev); #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags); int pci_dev_specific_enable_acs(struct pci_dev *dev); int pci_dev_specific_disable_acs_redir(struct pci_dev *dev); int pcie_failed_link_retrain(struct pci_dev *dev); #else static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags) { return -ENOTTY; } static inline int pci_dev_specific_enable_acs(struct pci_dev *dev) { return -ENOTTY; } static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) { return -ENOTTY; } static inline int pcie_failed_link_retrain(struct pci_dev *dev) { return -ENOTTY; } #endif /* PCI error reporting and recovery */ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_channel_state_t state, pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); int pcie_retrain_link(struct pci_dev *pdev, bool use_lt); /* ASPM-related functionality we need even without CONFIG_PCIEASPM */ void pci_save_ltr_state(struct pci_dev *dev); void pci_restore_ltr_state(struct pci_dev *dev); void pci_configure_aspm_l1ss(struct pci_dev *dev); void pci_save_aspm_l1ss_state(struct pci_dev *dev); void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #ifdef CONFIG_PCIEASPM void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); void pci_configure_ltr(struct pci_dev *pdev); void pci_bridge_reconfigure_ltr(struct pci_dev *pdev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } static inline void pci_configure_ltr(struct pci_dev *pdev) { } static inline void pci_bridge_reconfigure_ltr(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCIE_ECRC void pcie_set_ecrc_checking(struct pci_dev *dev); void pcie_ecrc_get_policy(char *str); #else static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } static inline void pcie_ecrc_get_policy(char *str) { } #endif struct pci_dev_reset_methods { u16 vendor; u16 device; int (*reset)(struct pci_dev *dev, bool probe); }; struct pci_reset_fn_method { int (*reset_fn)(struct pci_dev *pdev, bool probe); char *name; }; #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_reset(struct pci_dev *dev, bool probe); #else static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } #endif #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64) int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res); #else static inline int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res) { return -ENODEV; } #endif int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size); static inline u64 pci_rebar_size_to_bytes(int size) { return 1ULL << (size + 20); } struct device_node; #ifdef CONFIG_OF int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale); bool of_pci_preserve_config(struct device_node *node); int pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge); #else static inline int of_pci_parse_bus_range(struct device_node *node, struct resource *res) { return -EINVAL; } static inline int of_get_pci_domain_nr(struct device_node *node) { return -1; } static inline int of_pci_get_max_link_speed(struct device_node *node) { return -EINVAL; } static inline u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale) { if (slot_power_limit_value) *slot_power_limit_value = 0; if (slot_power_limit_scale) *slot_power_limit_scale = 0; return 0; } static inline bool of_pci_preserve_config(struct device_node *node) { return false; } static inline int pci_set_of_node(struct pci_dev *dev) { return 0; } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge) { return 0; } #endif /* CONFIG_OF */ struct of_changeset; #ifdef CONFIG_PCI_DYNAMIC_OF_NODES void of_pci_make_dev_node(struct pci_dev *pdev); void of_pci_remove_node(struct pci_dev *pdev); int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs, struct device_node *np); #else static inline void of_pci_make_dev_node(struct pci_dev *pdev) { } static inline void of_pci_remove_node(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCIEAER void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); int pci_aer_raw_clear_status(struct pci_dev *dev); void pci_save_aer_state(struct pci_dev *dev); void pci_restore_aer_state(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline void pci_save_aer_state(struct pci_dev *dev) { } static inline void pci_restore_aer_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_ACPI bool pci_acpi_preserve_config(struct pci_host_bridge *bridge); int pci_acpi_program_hp_params(struct pci_dev *dev); extern const struct attribute_group pci_dev_acpi_attr_group; void pci_set_acpi_fwnode(struct pci_dev *dev); int pci_dev_acpi_reset(struct pci_dev *dev, bool probe); bool acpi_pci_power_manageable(struct pci_dev *dev); bool acpi_pci_bridge_d3(struct pci_dev *dev); int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t acpi_pci_get_power_state(struct pci_dev *dev); void acpi_pci_refresh_power_state(struct pci_dev *dev); int acpi_pci_wakeup(struct pci_dev *dev, bool enable); bool acpi_pci_need_resume(struct pci_dev *dev); pci_power_t acpi_pci_choose_state(struct pci_dev *pdev); #else static inline bool pci_acpi_preserve_config(struct pci_host_bridge *bridge) { return false; } static inline int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } static inline void pci_set_acpi_fwnode(struct pci_dev *dev) { } static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { return -ENODEV; } static inline bool acpi_pci_power_manageable(struct pci_dev *dev) { return false; } static inline bool acpi_pci_bridge_d3(struct pci_dev *dev) { return false; } static inline int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return -ENODEV; } static inline pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) { return PCI_UNKNOWN; } static inline void acpi_pci_refresh_power_state(struct pci_dev *dev) { } static inline int acpi_pci_wakeup(struct pci_dev *dev, bool enable) { return -ENODEV; } static inline bool acpi_pci_need_resume(struct pci_dev *dev) { return false; } static inline pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) { return PCI_POWER_ERROR; } #endif #ifdef CONFIG_PCIEASPM extern const struct attribute_group aspm_ctrl_attr_group; #endif extern const struct attribute_group pci_dev_reset_method_attr_group; #ifdef CONFIG_X86_INTEL_MID bool pci_use_mid_pm(void); int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); pci_power_t mid_pci_get_power_state(struct pci_dev *pdev); #else static inline bool pci_use_mid_pm(void) { return false; } static inline int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) { return -ENODEV; } static inline pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) { return PCI_UNKNOWN; } #endif int pcim_intx(struct pci_dev *dev, int enable); int pcim_request_region_exclusive(struct pci_dev *pdev, int bar, const char *name); void pcim_release_region(struct pci_dev *pdev, int bar); /* * Config Address for PCI Configuration Mechanism #1 * * See PCI Local Bus Specification, Revision 3.0, * Section 3.2.2.3.2, Figure 3-2, p. 50. */ #define PCI_CONF1_BUS_SHIFT 16 /* Bus number */ #define PCI_CONF1_DEV_SHIFT 11 /* Device number */ #define PCI_CONF1_FUNC_SHIFT 8 /* Function number */ #define PCI_CONF1_BUS_MASK 0xff #define PCI_CONF1_DEV_MASK 0x1f #define PCI_CONF1_FUNC_MASK 0x7 #define PCI_CONF1_REG_MASK 0xfc /* Limit aligned offset to a maximum of 256B */ #define PCI_CONF1_ENABLE BIT(31) #define PCI_CONF1_BUS(x) (((x) & PCI_CONF1_BUS_MASK) << PCI_CONF1_BUS_SHIFT) #define PCI_CONF1_DEV(x) (((x) & PCI_CONF1_DEV_MASK) << PCI_CONF1_DEV_SHIFT) #define PCI_CONF1_FUNC(x) (((x) & PCI_CONF1_FUNC_MASK) << PCI_CONF1_FUNC_SHIFT) #define PCI_CONF1_REG(x) ((x) & PCI_CONF1_REG_MASK) #define PCI_CONF1_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ENABLE | \ PCI_CONF1_BUS(bus) | \ PCI_CONF1_DEV(dev) | \ PCI_CONF1_FUNC(func) | \ PCI_CONF1_REG(reg)) /* * Extension of PCI Config Address for accessing extended PCIe registers * * No standardized specification, but used on lot of non-ECAM-compliant ARM SoCs * or on AMD Barcelona and new CPUs. Reserved bits [27:24] of PCI Config Address * are used for specifying additional 4 high bits of PCI Express register. */ #define PCI_CONF1_EXT_REG_SHIFT 16 #define PCI_CONF1_EXT_REG_MASK 0xf00 #define PCI_CONF1_EXT_REG(x) (((x) & PCI_CONF1_EXT_REG_MASK) << PCI_CONF1_EXT_REG_SHIFT) #define PCI_CONF1_EXT_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ADDRESS(bus, dev, func, reg) | \ PCI_CONF1_EXT_REG(reg)) #endif /* DRIVERS_PCI_H */ |
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Core registration and callback routines for MTD * drivers and users. * * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org> * Copyright © 2006 Red Hat UK Limited */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/ioctl.h> #include <linux/init.h> #include <linux/of.h> #include <linux/proc_fs.h> #include <linux/idr.h> #include <linux/backing-dev.h> #include <linux/gfp.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/reboot.h> #include <linux/leds.h> #include <linux/debugfs.h> #include <linux/nvmem-provider.h> #include <linux/root_dev.h> #include <linux/error-injection.h> #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> #include "mtdcore.h" struct backing_dev_info *mtd_bdi; #ifdef CONFIG_PM_SLEEP static int mtd_cls_suspend(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); return mtd ? mtd_suspend(mtd) : 0; } static int mtd_cls_resume(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); if (mtd) mtd_resume(mtd); return 0; } static SIMPLE_DEV_PM_OPS(mtd_cls_pm_ops, mtd_cls_suspend, mtd_cls_resume); #define MTD_CLS_PM_OPS (&mtd_cls_pm_ops) #else #define MTD_CLS_PM_OPS NULL #endif static struct class mtd_class = { .name = "mtd", .pm = MTD_CLS_PM_OPS, }; static DEFINE_IDR(mtd_idr); /* These are exported solely for the purpose of mtd_blkdevs.c. You should not use them for _anything_ else */ DEFINE_MUTEX(mtd_table_mutex); EXPORT_SYMBOL_GPL(mtd_table_mutex); struct mtd_info *__mtd_next_device(int i) { return idr_get_next(&mtd_idr, &i); } EXPORT_SYMBOL_GPL(__mtd_next_device); static LIST_HEAD(mtd_notifiers); #define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2) /* REVISIT once MTD uses the driver model better, whoever allocates * the mtd_info will probably want to use the release() hook... */ static void mtd_release(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); dev_t index = MTD_DEVT(mtd->index); idr_remove(&mtd_idr, mtd->index); of_node_put(mtd_get_of_node(mtd)); if (mtd_is_partition(mtd)) release_mtd_partition(mtd); /* remove /dev/mtdXro node */ device_destroy(&mtd_class, index + 1); } static void mtd_device_release(struct kref *kref) { struct mtd_info *mtd = container_of(kref, struct mtd_info, refcnt); bool is_partition = mtd_is_partition(mtd); debugfs_remove_recursive(mtd->dbg.dfs_dir); /* Try to remove the NVMEM provider */ nvmem_unregister(mtd->nvmem); device_unregister(&mtd->dev); /* * Clear dev so mtd can be safely re-registered later if desired. * Should not be done for partition, * as it was already destroyed in device_unregister(). */ if (!is_partition) memset(&mtd->dev, 0, sizeof(mtd->dev)); module_put(THIS_MODULE); } #define MTD_DEVICE_ATTR_RO(name) \ static DEVICE_ATTR(name, 0444, mtd_##name##_show, NULL) #define MTD_DEVICE_ATTR_RW(name) \ static DEVICE_ATTR(name, 0644, mtd_##name##_show, mtd_##name##_store) static ssize_t mtd_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); char *type; switch (mtd->type) { case MTD_ABSENT: type = "absent"; break; case MTD_RAM: type = "ram"; break; case MTD_ROM: type = "rom"; break; case MTD_NORFLASH: type = "nor"; break; case MTD_NANDFLASH: type = "nand"; break; case MTD_DATAFLASH: type = "dataflash"; break; case MTD_UBIVOLUME: type = "ubi"; break; case MTD_MLCNANDFLASH: type = "mlc-nand"; break; default: type = "unknown"; } return sysfs_emit(buf, "%s\n", type); } MTD_DEVICE_ATTR_RO(type); static ssize_t mtd_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "0x%lx\n", (unsigned long)mtd->flags); } MTD_DEVICE_ATTR_RO(flags); static ssize_t mtd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", (unsigned long long)mtd->size); } MTD_DEVICE_ATTR_RO(size); static ssize_t mtd_erasesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->erasesize); } MTD_DEVICE_ATTR_RO(erasesize); static ssize_t mtd_writesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->writesize); } MTD_DEVICE_ATTR_RO(writesize); static ssize_t mtd_subpagesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); unsigned int subpagesize = mtd->writesize >> mtd->subpage_sft; return sysfs_emit(buf, "%u\n", subpagesize); } MTD_DEVICE_ATTR_RO(subpagesize); static ssize_t mtd_oobsize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->oobsize); } MTD_DEVICE_ATTR_RO(oobsize); static ssize_t mtd_oobavail_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->oobavail); } MTD_DEVICE_ATTR_RO(oobavail); static ssize_t mtd_numeraseregions_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->numeraseregions); } MTD_DEVICE_ATTR_RO(numeraseregions); static ssize_t mtd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", mtd->name); } MTD_DEVICE_ATTR_RO(name); static ssize_t mtd_ecc_strength_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->ecc_strength); } MTD_DEVICE_ATTR_RO(ecc_strength); static ssize_t mtd_bitflip_threshold_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->bitflip_threshold); } static ssize_t mtd_bitflip_threshold_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mtd_info *mtd = dev_get_drvdata(dev); unsigned int bitflip_threshold; int retval; retval = kstrtouint(buf, 0, &bitflip_threshold); if (retval) return retval; mtd->bitflip_threshold = bitflip_threshold; return count; } MTD_DEVICE_ATTR_RW(bitflip_threshold); static ssize_t mtd_ecc_step_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->ecc_step_size); } MTD_DEVICE_ATTR_RO(ecc_step_size); static ssize_t mtd_corrected_bits_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->corrected); } MTD_DEVICE_ATTR_RO(corrected_bits); /* ecc stats corrected */ static ssize_t mtd_ecc_failures_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->failed); } MTD_DEVICE_ATTR_RO(ecc_failures); /* ecc stats errors */ static ssize_t mtd_bad_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->badblocks); } MTD_DEVICE_ATTR_RO(bad_blocks); static ssize_t mtd_bbt_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->bbtblocks); } MTD_DEVICE_ATTR_RO(bbt_blocks); static struct attribute *mtd_attrs[] = { &dev_attr_type.attr, &dev_attr_flags.attr, &dev_attr_size.attr, &dev_attr_erasesize.attr, &dev_attr_writesize.attr, &dev_attr_subpagesize.attr, &dev_attr_oobsize.attr, &dev_attr_oobavail.attr, &dev_attr_numeraseregions.attr, &dev_attr_name.attr, &dev_attr_ecc_strength.attr, &dev_attr_ecc_step_size.attr, &dev_attr_corrected_bits.attr, &dev_attr_ecc_failures.attr, &dev_attr_bad_blocks.attr, &dev_attr_bbt_blocks.attr, &dev_attr_bitflip_threshold.attr, NULL, }; ATTRIBUTE_GROUPS(mtd); static const struct device_type mtd_devtype = { .name = "mtd", .groups = mtd_groups, .release = mtd_release, }; static bool mtd_expert_analysis_mode; #ifdef CONFIG_DEBUG_FS bool mtd_check_expert_analysis_mode(void) { const char *mtd_expert_analysis_warning = "Bad block checks have been entirely disabled.\n" "This is only reserved for post-mortem forensics and debug purposes.\n" "Never enable this mode if you do not know what you are doing!\n"; return WARN_ONCE(mtd_expert_analysis_mode, mtd_expert_analysis_warning); } EXPORT_SYMBOL_GPL(mtd_check_expert_analysis_mode); #endif static struct dentry *dfs_dir_mtd; static void mtd_debugfs_populate(struct mtd_info *mtd) { struct device *dev = &mtd->dev; if (IS_ERR_OR_NULL(dfs_dir_mtd)) return; mtd->dbg.dfs_dir = debugfs_create_dir(dev_name(dev), dfs_dir_mtd); } #ifndef CONFIG_MMU unsigned mtd_mmap_capabilities(struct mtd_info *mtd) { switch (mtd->type) { case MTD_RAM: return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | NOMMU_MAP_READ | NOMMU_MAP_WRITE; case MTD_ROM: return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | NOMMU_MAP_READ; default: return NOMMU_MAP_COPY; } } EXPORT_SYMBOL_GPL(mtd_mmap_capabilities); #endif static int mtd_reboot_notifier(struct notifier_block *n, unsigned long state, void *cmd) { struct mtd_info *mtd; mtd = container_of(n, struct mtd_info, reboot_notifier); mtd->_reboot(mtd); return NOTIFY_DONE; } /** * mtd_wunit_to_pairing_info - get pairing information of a wunit * @mtd: pointer to new MTD device info structure * @wunit: write unit we are interested in * @info: returned pairing information * * Retrieve pairing information associated to the wunit. * This is mainly useful when dealing with MLC/TLC NANDs where pages can be * paired together, and where programming a page may influence the page it is * paired with. * The notion of page is replaced by the term wunit (write-unit) to stay * consistent with the ->writesize field. * * The @wunit argument can be extracted from an absolute offset using * mtd_offset_to_wunit(). @info is filled with the pairing information attached * to @wunit. * * From the pairing info the MTD user can find all the wunits paired with * @wunit using the following loop: * * for (i = 0; i < mtd_pairing_groups(mtd); i++) { * info.pair = i; * mtd_pairing_info_to_wunit(mtd, &info); * ... * } */ int mtd_wunit_to_pairing_info(struct mtd_info *mtd, int wunit, struct mtd_pairing_info *info) { struct mtd_info *master = mtd_get_master(mtd); int npairs = mtd_wunit_per_eb(master) / mtd_pairing_groups(master); if (wunit < 0 || wunit >= npairs) return -EINVAL; if (master->pairing && master->pairing->get_info) return master->pairing->get_info(master, wunit, info); info->group = 0; info->pair = wunit; return 0; } EXPORT_SYMBOL_GPL(mtd_wunit_to_pairing_info); /** * mtd_pairing_info_to_wunit - get wunit from pairing information * @mtd: pointer to new MTD device info structure * @info: pairing information struct * * Returns a positive number representing the wunit associated to the info * struct, or a negative error code. * * This is the reverse of mtd_wunit_to_pairing_info(), and can help one to * iterate over all wunits of a given pair (see mtd_wunit_to_pairing_info() * doc). * * It can also be used to only program the first page of each pair (i.e. * page attached to group 0), which allows one to use an MLC NAND in * software-emulated SLC mode: * * info.group = 0; * npairs = mtd_wunit_per_eb(mtd) / mtd_pairing_groups(mtd); * for (info.pair = 0; info.pair < npairs; info.pair++) { * wunit = mtd_pairing_info_to_wunit(mtd, &info); * mtd_write(mtd, mtd_wunit_to_offset(mtd, blkoffs, wunit), * mtd->writesize, &retlen, buf + (i * mtd->writesize)); * } */ int mtd_pairing_info_to_wunit(struct mtd_info *mtd, const struct mtd_pairing_info *info) { struct mtd_info *master = mtd_get_master(mtd); int ngroups = mtd_pairing_groups(master); int npairs = mtd_wunit_per_eb(master) / ngroups; if (!info || info->pair < 0 || info->pair >= npairs || info->group < 0 || info->group >= ngroups) return -EINVAL; if (master->pairing && master->pairing->get_wunit) return mtd->pairing->get_wunit(master, info); return info->pair; } EXPORT_SYMBOL_GPL(mtd_pairing_info_to_wunit); /** * mtd_pairing_groups - get the number of pairing groups * @mtd: pointer to new MTD device info structure * * Returns the number of pairing groups. * * This number is usually equal to the number of bits exposed by a single * cell, and can be used in conjunction with mtd_pairing_info_to_wunit() * to iterate over all pages of a given pair. */ int mtd_pairing_groups(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); if (!master->pairing || !master->pairing->ngroups) return 1; return master->pairing->ngroups; } EXPORT_SYMBOL_GPL(mtd_pairing_groups); static int mtd_nvmem_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int err; err = mtd_read(mtd, offset, bytes, &retlen, val); if (err && err != -EUCLEAN) return err; return retlen == bytes ? 0 : -EIO; } static int mtd_nvmem_add(struct mtd_info *mtd) { struct device_node *node = mtd_get_of_node(mtd); struct nvmem_config config = {}; config.id = NVMEM_DEVID_NONE; config.dev = &mtd->dev; config.name = dev_name(&mtd->dev); config.owner = THIS_MODULE; config.add_legacy_fixed_of_cells = of_device_is_compatible(node, "nvmem-cells"); config.reg_read = mtd_nvmem_reg_read; config.size = mtd->size; config.word_size = 1; config.stride = 1; config.read_only = true; config.root_only = true; config.ignore_wp = true; config.priv = mtd; mtd->nvmem = nvmem_register(&config); if (IS_ERR(mtd->nvmem)) { /* Just ignore if there is no NVMEM support in the kernel */ if (PTR_ERR(mtd->nvmem) == -EOPNOTSUPP) mtd->nvmem = NULL; else return dev_err_probe(&mtd->dev, PTR_ERR(mtd->nvmem), "Failed to register NVMEM device\n"); } return 0; } static void mtd_check_of_node(struct mtd_info *mtd) { struct device_node *partitions, *parent_dn, *mtd_dn = NULL; const char *pname, *prefix = "partition-"; int plen, mtd_name_len, offset, prefix_len; /* Check if MTD already has a device node */ if (mtd_get_of_node(mtd)) return; if (!mtd_is_partition(mtd)) return; parent_dn = of_node_get(mtd_get_of_node(mtd->parent)); if (!parent_dn) return; if (mtd_is_partition(mtd->parent)) partitions = of_node_get(parent_dn); else partitions = of_get_child_by_name(parent_dn, "partitions"); if (!partitions) goto exit_parent; prefix_len = strlen(prefix); mtd_name_len = strlen(mtd->name); /* Search if a partition is defined with the same name */ for_each_child_of_node(partitions, mtd_dn) { /* Skip partition with no/wrong prefix */ if (!of_node_name_prefix(mtd_dn, prefix)) continue; /* Label have priority. Check that first */ if (!of_property_read_string(mtd_dn, "label", &pname)) { offset = 0; } else { pname = mtd_dn->name; offset = prefix_len; } plen = strlen(pname) - offset; if (plen == mtd_name_len && !strncmp(mtd->name, pname + offset, plen)) { mtd_set_of_node(mtd, mtd_dn); of_node_put(mtd_dn); break; } } of_node_put(partitions); exit_parent: of_node_put(parent_dn); } /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure * * Add a device to the list of MTD devices present in the system, and * notify each currently active MTD 'user' of its arrival. Returns * zero on success or non-zero on failure. */ int add_mtd_device(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); struct mtd_info *master = mtd_get_master(mtd); struct mtd_notifier *not; int i, error, ofidx; /* * May occur, for instance, on buggy drivers which call * mtd_device_parse_register() multiple times on the same master MTD, * especially with CONFIG_MTD_PARTITIONED_MASTER=y. */ if (WARN_ONCE(mtd->dev.type, "MTD already registered\n")) return -EEXIST; BUG_ON(mtd->writesize == 0); /* * MTD drivers should implement ->_{write,read}() or * ->_{write,read}_oob(), but not both. */ if (WARN_ON((mtd->_write && mtd->_write_oob) || (mtd->_read && mtd->_read_oob))) return -EINVAL; if (WARN_ON((!mtd->erasesize || !master->_erase) && !(mtd->flags & MTD_NO_ERASE))) return -EINVAL; /* * MTD_SLC_ON_MLC_EMULATION can only be set on partitions, when the * master is an MLC NAND and has a proper pairing scheme defined. * We also reject masters that implement ->_writev() for now, because * NAND controller drivers don't implement this hook, and adding the * SLC -> MLC address/length conversion to this path is useless if we * don't have a user. */ if (mtd->flags & MTD_SLC_ON_MLC_EMULATION && (!mtd_is_partition(mtd) || master->type != MTD_MLCNANDFLASH || !master->pairing || master->_writev)) return -EINVAL; mutex_lock(&mtd_table_mutex); ofidx = -1; if (np) ofidx = of_alias_get_id(np, "mtd"); if (ofidx >= 0) i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); else i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); if (i < 0) { error = i; goto fail_locked; } mtd->index = i; kref_init(&mtd->refcnt); /* default value if not set by driver */ if (mtd->bitflip_threshold == 0) mtd->bitflip_threshold = mtd->ecc_strength; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { int ngroups = mtd_pairing_groups(master); mtd->erasesize /= ngroups; mtd->size = (u64)mtd_div_by_eb(mtd->size, master) * mtd->erasesize; } if (is_power_of_2(mtd->erasesize)) mtd->erasesize_shift = ffs(mtd->erasesize) - 1; else mtd->erasesize_shift = 0; if (is_power_of_2(mtd->writesize)) mtd->writesize_shift = ffs(mtd->writesize) - 1; else mtd->writesize_shift = 0; mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1; mtd->writesize_mask = (1 << mtd->writesize_shift) - 1; /* Some chips always power up locked. Unlock them now */ if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK)) { error = mtd_unlock(mtd, 0, mtd->size); if (error && error != -EOPNOTSUPP) printk(KERN_WARNING "%s: unlock failed, writes may not work\n", mtd->name); /* Ignore unlock failures? */ error = 0; } /* Caller should have set dev.parent to match the * physical device, if appropriate. */ mtd->dev.type = &mtd_devtype; mtd->dev.class = &mtd_class; mtd->dev.devt = MTD_DEVT(i); dev_set_name(&mtd->dev, "mtd%d", i); dev_set_drvdata(&mtd->dev, mtd); mtd_check_of_node(mtd); of_node_get(mtd_get_of_node(mtd)); error = device_register(&mtd->dev); if (error) { put_device(&mtd->dev); goto fail_added; } /* Add the nvmem provider */ error = mtd_nvmem_add(mtd); if (error) goto fail_nvmem_add; mtd_debugfs_populate(mtd); device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, "mtd%dro", i); pr_debug("mtd: Giving out device %d to %s\n", i, mtd->name); /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) not->add(mtd); mutex_unlock(&mtd_table_mutex); if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) { if (IS_BUILTIN(CONFIG_MTD)) { pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name); ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index); } else { pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n", mtd->index, mtd->name); } } /* We _know_ we aren't being removed, because our caller is still holding us here. So none of this try_ nonsense, and no bitching about it either. :) */ __module_get(THIS_MODULE); return 0; fail_nvmem_add: device_unregister(&mtd->dev); fail_added: of_node_put(mtd_get_of_node(mtd)); idr_remove(&mtd_idr, i); fail_locked: mutex_unlock(&mtd_table_mutex); return error; } /** * del_mtd_device - unregister an MTD device * @mtd: pointer to MTD device info structure * * Remove a device from the list of MTD devices present in the system, * and notify each currently active MTD 'user' of its departure. * Returns zero on success or 1 on failure, which currently will happen * if the requested device does not appear to be present in the list. */ int del_mtd_device(struct mtd_info *mtd) { int ret; struct mtd_notifier *not; mutex_lock(&mtd_table_mutex); if (idr_find(&mtd_idr, mtd->index) != mtd) { ret = -ENODEV; goto out_error; } /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) not->remove(mtd); kref_put(&mtd->refcnt, mtd_device_release); ret = 0; out_error: mutex_unlock(&mtd_table_mutex); return ret; } /* * Set a few defaults based on the parent devices, if not provided by the * driver */ static void mtd_set_dev_defaults(struct mtd_info *mtd) { if (mtd->dev.parent) { if (!mtd->owner && mtd->dev.parent->driver) mtd->owner = mtd->dev.parent->driver->owner; if (!mtd->name) mtd->name = dev_name(mtd->dev.parent); } else { pr_debug("mtd device won't show a device symlink in sysfs\n"); } INIT_LIST_HEAD(&mtd->partitions); mutex_init(&mtd->master.partitions_lock); mutex_init(&mtd->master.chrdev_lock); } static ssize_t mtd_otp_size(struct mtd_info *mtd, bool is_user) { struct otp_info *info; ssize_t size = 0; unsigned int i; size_t retlen; int ret; info = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!info) return -ENOMEM; if (is_user) ret = mtd_get_user_prot_info(mtd, PAGE_SIZE, &retlen, info); else ret = mtd_get_fact_prot_info(mtd, PAGE_SIZE, &retlen, info); if (ret) goto err; for (i = 0; i < retlen / sizeof(*info); i++) size += info[i].length; kfree(info); return size; err: kfree(info); /* ENODATA means there is no OTP region. */ return ret == -ENODATA ? 0 : ret; } static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd, const char *compatible, int size, nvmem_reg_read_t reg_read) { struct nvmem_device *nvmem = NULL; struct nvmem_config config = {}; struct device_node *np; /* DT binding is optional */ np = of_get_compatible_child(mtd->dev.of_node, compatible); /* OTP nvmem will be registered on the physical device */ config.dev = mtd->dev.parent; config.name = compatible; config.id = NVMEM_DEVID_AUTO; config.owner = THIS_MODULE; config.add_legacy_fixed_of_cells = !mtd_type_is_nand(mtd); config.type = NVMEM_TYPE_OTP; config.root_only = true; config.ignore_wp = true; config.reg_read = reg_read; config.size = size; config.of_node = np; config.priv = mtd; nvmem = nvmem_register(&config); /* Just ignore if there is no NVMEM support in the kernel */ if (IS_ERR(nvmem) && PTR_ERR(nvmem) == -EOPNOTSUPP) nvmem = NULL; of_node_put(np); return nvmem; } static int mtd_nvmem_user_otp_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int ret; ret = mtd_read_user_prot_reg(mtd, offset, bytes, &retlen, val); if (ret) return ret; return retlen == bytes ? 0 : -EIO; } static int mtd_nvmem_fact_otp_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int ret; ret = mtd_read_fact_prot_reg(mtd, offset, bytes, &retlen, val); if (ret) return ret; return retlen == bytes ? 0 : -EIO; } static int mtd_otp_nvmem_add(struct mtd_info *mtd) { struct device *dev = mtd->dev.parent; struct nvmem_device *nvmem; ssize_t size; int err; if (mtd->_get_user_prot_info && mtd->_read_user_prot_reg) { size = mtd_otp_size(mtd, true); if (size < 0) { err = size; goto err; } if (size > 0) { nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size, mtd_nvmem_user_otp_reg_read); if (IS_ERR(nvmem)) { err = PTR_ERR(nvmem); goto err; } mtd->otp_user_nvmem = nvmem; } } if (mtd->_get_fact_prot_info && mtd->_read_fact_prot_reg) { size = mtd_otp_size(mtd, false); if (size < 0) { err = size; goto err; } if (size > 0) { /* * The factory OTP contains thing such as a unique serial * number and is small, so let's read it out and put it * into the entropy pool. */ void *otp; otp = kmalloc(size, GFP_KERNEL); if (!otp) { err = -ENOMEM; goto err; } err = mtd_nvmem_fact_otp_reg_read(mtd, 0, otp, size); if (err < 0) { kfree(otp); goto err; } add_device_randomness(otp, err); kfree(otp); nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size, mtd_nvmem_fact_otp_reg_read); if (IS_ERR(nvmem)) { err = PTR_ERR(nvmem); goto err; } mtd->otp_factory_nvmem = nvmem; } } return 0; err: nvmem_unregister(mtd->otp_user_nvmem); /* Don't report error if OTP is not supported. */ if (err == -EOPNOTSUPP) return 0; return dev_err_probe(dev, err, "Failed to register OTP NVMEM device\n"); } /** * mtd_device_parse_register - parse partitions and register an MTD device. * * @mtd: the MTD device to register * @types: the list of MTD partition probes to try, see * 'parse_mtd_partitions()' for more information * @parser_data: MTD partition parser-specific data * @parts: fallback partition information to register, if parsing fails; * only valid if %nr_parts > %0 * @nr_parts: the number of partitions in parts, if zero then the full * MTD device is registered if no partition info is found * * This function aggregates MTD partitions parsing (done by * 'parse_mtd_partitions()') and MTD device and partitions registering. It * basically follows the most common pattern found in many MTD drivers: * * * If the MTD_PARTITIONED_MASTER option is set, then the device as a whole is * registered first. * * Then It tries to probe partitions on MTD device @mtd using parsers * specified in @types (if @types is %NULL, then the default list of parsers * is used, see 'parse_mtd_partitions()' for more information). If none are * found this functions tries to fallback to information specified in * @parts/@nr_parts. * * If no partitions were found this function just registers the MTD device * @mtd and exits. * * Returns zero in case of success and a negative error code in case of failure. */ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, struct mtd_part_parser_data *parser_data, const struct mtd_partition *parts, int nr_parts) { int ret; mtd_set_dev_defaults(mtd); ret = mtd_otp_nvmem_add(mtd); if (ret) goto out; if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) { ret = add_mtd_device(mtd); if (ret) goto out; } /* Prefer parsed partitions over driver-provided fallback */ ret = parse_mtd_partitions(mtd, types, parser_data); if (ret == -EPROBE_DEFER) goto out; if (ret > 0) ret = 0; else if (nr_parts) ret = add_mtd_partitions(mtd, parts, nr_parts); else if (!device_is_registered(&mtd->dev)) ret = add_mtd_device(mtd); else ret = 0; if (ret) goto out; /* * FIXME: some drivers unfortunately call this function more than once. * So we have to check if we've already assigned the reboot notifier. * * Generally, we can make multiple calls work for most cases, but it * does cause problems with parse_mtd_partitions() above (e.g., * cmdlineparts will register partitions more than once). */ WARN_ONCE(mtd->_reboot && mtd->reboot_notifier.notifier_call, "MTD already registered\n"); if (mtd->_reboot && !mtd->reboot_notifier.notifier_call) { mtd->reboot_notifier.notifier_call = mtd_reboot_notifier; register_reboot_notifier(&mtd->reboot_notifier); } out: if (ret) { nvmem_unregister(mtd->otp_user_nvmem); nvmem_unregister(mtd->otp_factory_nvmem); } if (ret && device_is_registered(&mtd->dev)) del_mtd_device(mtd); return ret; } EXPORT_SYMBOL_GPL(mtd_device_parse_register); /** * mtd_device_unregister - unregister an existing MTD device. * * @master: the MTD device to unregister. This will unregister both the master * and any partitions if registered. */ int mtd_device_unregister(struct mtd_info *master) { int err; if (master->_reboot) { unregister_reboot_notifier(&master->reboot_notifier); memset(&master->reboot_notifier, 0, sizeof(master->reboot_notifier)); } nvmem_unregister(master->otp_user_nvmem); nvmem_unregister(master->otp_factory_nvmem); err = del_mtd_partitions(master); if (err) return err; if (!device_is_registered(&master->dev)) return 0; return del_mtd_device(master); } EXPORT_SYMBOL_GPL(mtd_device_unregister); /** * register_mtd_user - register a 'user' of MTD devices. * @new: pointer to notifier info structure * * Registers a pair of callbacks function to be called upon addition * or removal of MTD devices. Causes the 'add' callback to be immediately * invoked for each MTD device currently present in the system. */ void register_mtd_user (struct mtd_notifier *new) { struct mtd_info *mtd; mutex_lock(&mtd_table_mutex); list_add(&new->list, &mtd_notifiers); __module_get(THIS_MODULE); mtd_for_each_device(mtd) new->add(mtd); mutex_unlock(&mtd_table_mutex); } EXPORT_SYMBOL_GPL(register_mtd_user); /** * unregister_mtd_user - unregister a 'user' of MTD devices. * @old: pointer to notifier info structure * * Removes a callback function pair from the list of 'users' to be * notified upon addition or removal of MTD devices. Causes the * 'remove' callback to be immediately invoked for each MTD device * currently present in the system. */ int unregister_mtd_user (struct mtd_notifier *old) { struct mtd_info *mtd; mutex_lock(&mtd_table_mutex); module_put(THIS_MODULE); mtd_for_each_device(mtd) old->remove(mtd); list_del(&old->list); mutex_unlock(&mtd_table_mutex); return 0; } EXPORT_SYMBOL_GPL(unregister_mtd_user); /** * get_mtd_device - obtain a validated handle for an MTD device * @mtd: last known address of the required MTD device * @num: internal device number of the required MTD device * * Given a number and NULL address, return the num'th entry in the device * table, if any. Given an address and num == -1, search the device table * for a device with that address and return if it's still present. Given * both, return the num'th driver only if its address matches. Return * error code if not. */ struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num) { struct mtd_info *ret = NULL, *other; int err = -ENODEV; mutex_lock(&mtd_table_mutex); if (num == -1) { mtd_for_each_device(other) { if (other == mtd) { ret = mtd; break; } } } else if (num >= 0) { ret = idr_find(&mtd_idr, num); if (mtd && mtd != ret) ret = NULL; } if (!ret) { ret = ERR_PTR(err); goto out; } err = __get_mtd_device(ret); if (err) ret = ERR_PTR(err); out: mutex_unlock(&mtd_table_mutex); return ret; } EXPORT_SYMBOL_GPL(get_mtd_device); int __get_mtd_device(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); int err; if (master->_get_device) { err = master->_get_device(mtd); if (err) return err; } if (!try_module_get(master->owner)) { if (master->_put_device) master->_put_device(master); return -ENODEV; } while (mtd) { if (mtd != master) kref_get(&mtd->refcnt); mtd = mtd->parent; } if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) kref_get(&master->refcnt); return 0; } EXPORT_SYMBOL_GPL(__get_mtd_device); /** * of_get_mtd_device_by_node - obtain an MTD device associated with a given node * * @np: device tree node */ struct mtd_info *of_get_mtd_device_by_node(struct device_node *np) { struct mtd_info *mtd = NULL; struct mtd_info *tmp; int err; mutex_lock(&mtd_table_mutex); err = -EPROBE_DEFER; mtd_for_each_device(tmp) { if (mtd_get_of_node(tmp) == np) { mtd = tmp; err = __get_mtd_device(mtd); break; } } mutex_unlock(&mtd_table_mutex); return err ? ERR_PTR(err) : mtd; } EXPORT_SYMBOL_GPL(of_get_mtd_device_by_node); /** * get_mtd_device_nm - obtain a validated handle for an MTD device by * device name * @name: MTD device name to open * * This function returns MTD device description structure in case of * success and an error code in case of failure. */ struct mtd_info *get_mtd_device_nm(const char *name) { int err = -ENODEV; struct mtd_info *mtd = NULL, *other; mutex_lock(&mtd_table_mutex); mtd_for_each_device(other) { if (!strcmp(name, other->name)) { mtd = other; break; } } if (!mtd) goto out_unlock; err = __get_mtd_device(mtd); if (err) goto out_unlock; mutex_unlock(&mtd_table_mutex); return mtd; out_unlock: mutex_unlock(&mtd_table_mutex); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(get_mtd_device_nm); void put_mtd_device(struct mtd_info *mtd) { mutex_lock(&mtd_table_mutex); __put_mtd_device(mtd); mutex_unlock(&mtd_table_mutex); } EXPORT_SYMBOL_GPL(put_mtd_device); void __put_mtd_device(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); while (mtd) { /* kref_put() can relese mtd, so keep a reference mtd->parent */ struct mtd_info *parent = mtd->parent; if (mtd != master) kref_put(&mtd->refcnt, mtd_device_release); mtd = parent; } if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) kref_put(&master->refcnt, mtd_device_release); module_put(master->owner); /* must be the last as master can be freed in the _put_device */ if (master->_put_device) master->_put_device(master); } EXPORT_SYMBOL_GPL(__put_mtd_device); /* * Erase is an synchronous operation. Device drivers are epected to return a * negative error code if the operation failed and update instr->fail_addr * to point the portion that was not properly erased. */ int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) { struct mtd_info *master = mtd_get_master(mtd); u64 mst_ofs = mtd_get_master_ofs(mtd, 0); struct erase_info adjinstr; int ret; instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; adjinstr = *instr; if (!mtd->erasesize || !master->_erase) return -ENOTSUPP; if (instr->addr >= mtd->size || instr->len > mtd->size - instr->addr) return -EINVAL; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; if (!instr->len) return 0; ledtrig_mtd_activity(); if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { adjinstr.addr = (loff_t)mtd_div_by_eb(instr->addr, mtd) * master->erasesize; adjinstr.len = ((u64)mtd_div_by_eb(instr->addr + instr->len, mtd) * master->erasesize) - adjinstr.addr; } adjinstr.addr += mst_ofs; ret = master->_erase(master, &adjinstr); if (adjinstr.fail_addr != MTD_FAIL_ADDR_UNKNOWN) { instr->fail_addr = adjinstr.fail_addr - mst_ofs; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { instr->fail_addr = mtd_div_by_eb(instr->fail_addr, master); instr->fail_addr *= mtd->erasesize; } } return ret; } EXPORT_SYMBOL_GPL(mtd_erase); ALLOW_ERROR_INJECTION(mtd_erase, ERRNO); /* * This stuff for eXecute-In-Place. phys is optional and may be set to NULL. */ int mtd_point(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys) { struct mtd_info *master = mtd_get_master(mtd); *retlen = 0; *virt = NULL; if (phys) *phys = 0; if (!master->_point) return -EOPNOTSUPP; if (from < 0 || from >= mtd->size || len > mtd->size - from) return -EINVAL; if (!len) return 0; from = mtd_get_master_ofs(mtd, from); return master->_point(master, from, len, retlen, virt, phys); } EXPORT_SYMBOL_GPL(mtd_point); /* We probably shouldn't allow XIP if the unpoint isn't a NULL */ int mtd_unpoint(struct mtd_info *mtd, loff_t from, size_t len) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_unpoint) return -EOPNOTSUPP; if (from < 0 || from >= mtd->size || len > mtd->size - from) return -EINVAL; if (!len) return 0; return master->_unpoint(master, mtd_get_master_ofs(mtd, from), len); } EXPORT_SYMBOL_GPL(mtd_unpoint); /* * Allow NOMMU mmap() to directly map the device (if not NULL) * - return the address to which the offset maps * - return -ENOSYS to indicate refusal to do the mapping */ unsigned long mtd_get_unmapped_area(struct mtd_info *mtd, unsigned long len, unsigned long offset, unsigned long flags) { size_t retlen; void *virt; int ret; ret = mtd_point(mtd, offset, len, &retlen, &virt, NULL); if (ret) return ret; if (retlen != len) { mtd_unpoint(mtd, offset, retlen); return -ENOSYS; } return (unsigned long)virt; } EXPORT_SYMBOL_GPL(mtd_get_unmapped_area); static void mtd_update_ecc_stats(struct mtd_info *mtd, struct mtd_info *master, const struct mtd_ecc_stats *old_stats) { struct mtd_ecc_stats diff; if (master == mtd) return; diff = master->ecc_stats; diff.failed -= old_stats->failed; diff.corrected -= old_stats->corrected; while (mtd->parent) { mtd->ecc_stats.failed += diff.failed; mtd->ecc_stats.corrected += diff.corrected; mtd = mtd->parent; } } int mtd_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct mtd_oob_ops ops = { .len = len, .datbuf = buf, }; int ret; ret = mtd_read_oob(mtd, from, &ops); *retlen = ops.retlen; WARN_ON_ONCE(*retlen != len && mtd_is_bitflip_or_eccerr(ret)); return ret; } EXPORT_SYMBOL_GPL(mtd_read); ALLOW_ERROR_INJECTION(mtd_read, ERRNO); int mtd_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct mtd_oob_ops ops = { .len = len, .datbuf = (u8 *)buf, }; int ret; ret = mtd_write_oob(mtd, to, &ops); *retlen = ops.retlen; return ret; } EXPORT_SYMBOL_GPL(mtd_write); ALLOW_ERROR_INJECTION(mtd_write, ERRNO); /* * In blackbox flight recorder like scenarios we want to make successful writes * in interrupt context. panic_write() is only intended to be called when its * known the kernel is about to panic and we need the write to succeed. Since * the kernel is not going to be running for much longer, this function can * break locks and delay to ensure the write succeeds (but not sleep). */ int mtd_panic_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct mtd_info *master = mtd_get_master(mtd); *retlen = 0; if (!master->_panic_write) return -EOPNOTSUPP; if (to < 0 || to >= mtd->size || len > mtd->size - to) return -EINVAL; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; if (!len) return 0; if (!master->oops_panic_write) master->oops_panic_write = true; return master->_panic_write(master, mtd_get_master_ofs(mtd, to), len, retlen, buf); } EXPORT_SYMBOL_GPL(mtd_panic_write); static int mtd_check_oob_ops(struct mtd_info *mtd, loff_t offs, struct mtd_oob_ops *ops) { /* * Some users are setting ->datbuf or ->oobbuf to NULL, but are leaving * ->len or ->ooblen uninitialized. Force ->len and ->ooblen to 0 in * this case. */ if (!ops->datbuf) ops->len = 0; if (!ops->oobbuf) ops->ooblen = 0; if (offs < 0 || offs + ops->len > mtd->size) return -EINVAL; if (ops->ooblen) { size_t maxooblen; if (ops->ooboffs >= mtd_oobavail(mtd, ops)) return -EINVAL; maxooblen = ((size_t)(mtd_div_by_ws(mtd->size, mtd) - mtd_div_by_ws(offs, mtd)) * mtd_oobavail(mtd, ops)) - ops->ooboffs; if (ops->ooblen > maxooblen) return -EINVAL; } return 0; } static int mtd_read_oob_std(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ret; from = mtd_get_master_ofs(mtd, from); if (master->_read_oob) ret = master->_read_oob(master, from, ops); else ret = master->_read(master, from, ops->len, &ops->retlen, ops->datbuf); return ret; } static int mtd_write_oob_std(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ret; to = mtd_get_master_ofs(mtd, to); if (master->_write_oob) ret = master->_write_oob(master, to, ops); else ret = master->_write(master, to, ops->len, &ops->retlen, ops->datbuf); return ret; } static int mtd_io_emulated_slc(struct mtd_info *mtd, loff_t start, bool read, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ngroups = mtd_pairing_groups(master); int npairs = mtd_wunit_per_eb(master) / ngroups; struct mtd_oob_ops adjops = *ops; unsigned int wunit, oobavail; struct mtd_pairing_info info; int max_bitflips = 0; u32 ebofs, pageofs; loff_t base, pos; ebofs = mtd_mod_by_eb(start, mtd); base = (loff_t)mtd_div_by_eb(start, mtd) * master->erasesize; info.group = 0; info.pair = mtd_div_by_ws(ebofs, mtd); pageofs = mtd_mod_by_ws(ebofs, mtd); oobavail = mtd_oobavail(mtd, ops); while (ops->retlen < ops->len || ops->oobretlen < ops->ooblen) { int ret; if (info.pair >= npairs) { info.pair = 0; base += master->erasesize; } wunit = mtd_pairing_info_to_wunit(master, &info); pos = mtd_wunit_to_offset(mtd, base, wunit); adjops.len = ops->len - ops->retlen; if (adjops.len > mtd->writesize - pageofs) adjops.len = mtd->writesize - pageofs; adjops.ooblen = ops->ooblen - ops->oobretlen; if (adjops.ooblen > oobavail - adjops.ooboffs) adjops.ooblen = oobavail - adjops.ooboffs; if (read) { ret = mtd_read_oob_std(mtd, pos + pageofs, &adjops); if (ret > 0) max_bitflips = max(max_bitflips, ret); } else { ret = mtd_write_oob_std(mtd, pos + pageofs, &adjops); } if (ret < 0) return ret; max_bitflips = max(max_bitflips, ret); ops->retlen += adjops.retlen; ops->oobretlen += adjops.oobretlen; adjops.datbuf += adjops.retlen; adjops.oobbuf += adjops.oobretlen; adjops.ooboffs = 0; pageofs = 0; info.pair++; } return max_bitflips; } int mtd_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); struct mtd_ecc_stats old_stats = master->ecc_stats; int ret_code; ops->retlen = ops->oobretlen = 0; ret_code = mtd_check_oob_ops(mtd, from, ops); if (ret_code) return ret_code; ledtrig_mtd_activity(); /* Check the validity of a potential fallback on mtd->_read */ if (!master->_read_oob && (!master->_read || ops->oobbuf)) return -EOPNOTSUPP; if (ops->stats) memset(ops->stats, 0, sizeof(*ops->stats)); if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) ret_code = mtd_io_emulated_slc(mtd, from, true, ops); else ret_code = mtd_read_oob_std(mtd, from, ops); mtd_update_ecc_stats(mtd, master, &old_stats); /* * In cases where ops->datbuf != NULL, mtd->_read_oob() has semantics * similar to mtd->_read(), returning a non-negative integer * representing max bitflips. In other cases, mtd->_read_oob() may * return -EUCLEAN. In all cases, perform similar logic to mtd_read(). */ if (unlikely(ret_code < 0)) return ret_code; if (mtd->ecc_strength == 0) return 0; /* device lacks ecc */ if (ops->stats) ops->stats->max_bitflips = ret_code; return ret_code >= mtd->bitflip_threshold ? -EUCLEAN : 0; } EXPORT_SYMBOL_GPL(mtd_read_oob); int mtd_write_oob(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ret; ops->retlen = ops->oobretlen = 0; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; ret = mtd_check_oob_ops(mtd, to, ops); if (ret) return ret; ledtrig_mtd_activity(); /* Check the validity of a potential fallback on mtd->_write */ if (!master->_write_oob && (!master->_write || ops->oobbuf)) return -EOPNOTSUPP; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) return mtd_io_emulated_slc(mtd, to, false, ops); return mtd_write_oob_std(mtd, to, ops); } EXPORT_SYMBOL_GPL(mtd_write_oob); /** * mtd_ooblayout_ecc - Get the OOB region definition of a specific ECC section * @mtd: MTD device structure * @section: ECC section. Depending on the layout you may have all the ECC * bytes stored in a single contiguous section, or one section * per ECC chunk (and sometime several sections for a single ECC * ECC chunk) * @oobecc: OOB region struct filled with the appropriate ECC position * information * * This function returns ECC section information in the OOB area. If you want * to get all the ECC bytes information, then you should call * mtd_ooblayout_ecc(mtd, section++, oobecc) until it returns -ERANGE. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *oobecc) { struct mtd_info *master = mtd_get_master(mtd); memset(oobecc, 0, sizeof(*oobecc)); if (!master || section < 0) return -EINVAL; if (!master->ooblayout || !master->ooblayout->ecc) return -ENOTSUPP; return master->ooblayout->ecc(master, section, oobecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc); /** * mtd_ooblayout_free - Get the OOB region definition of a specific free * section * @mtd: MTD device structure * @section: Free section you are interested in. Depending on the layout * you may have all the free bytes stored in a single contiguous * section, or one section per ECC chunk plus an extra section * for the remaining bytes (or other funky layout). * @oobfree: OOB region struct filled with the appropriate free position * information * * This function returns free bytes position in the OOB area. If you want * to get all the free bytes information, then you should call * mtd_ooblayout_free(mtd, section++, oobfree) until it returns -ERANGE. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_free(struct mtd_info *mtd, int section, struct mtd_oob_region *oobfree) { struct mtd_info *master = mtd_get_master(mtd); memset(oobfree, 0, sizeof(*oobfree)); if (!master || section < 0) return -EINVAL; if (!master->ooblayout || !master->ooblayout->free) return -ENOTSUPP; return master->ooblayout->free(master, section, oobfree); } EXPORT_SYMBOL_GPL(mtd_ooblayout_free); /** * mtd_ooblayout_find_region - Find the region attached to a specific byte * @mtd: mtd info structure * @byte: the byte we are searching for * @sectionp: pointer where the section id will be stored * @oobregion: used to retrieve the ECC position * @iter: iterator function. Should be either mtd_ooblayout_free or * mtd_ooblayout_ecc depending on the region type you're searching for * * This function returns the section id and oobregion information of a * specific byte. For example, say you want to know where the 4th ECC byte is * stored, you'll use: * * mtd_ooblayout_find_region(mtd, 3, §ion, &oobregion, mtd_ooblayout_ecc); * * Returns zero on success, a negative error code otherwise. */ static int mtd_ooblayout_find_region(struct mtd_info *mtd, int byte, int *sectionp, struct mtd_oob_region *oobregion, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { int pos = 0, ret, section = 0; memset(oobregion, 0, sizeof(*oobregion)); while (1) { ret = iter(mtd, section, oobregion); if (ret) return ret; if (pos + oobregion->length > byte) break; pos += oobregion->length; section++; } /* * Adjust region info to make it start at the beginning at the * 'start' ECC byte. */ oobregion->offset += byte - pos; oobregion->length -= byte - pos; *sectionp = section; return 0; } /** * mtd_ooblayout_find_eccregion - Find the ECC region attached to a specific * ECC byte * @mtd: mtd info structure * @eccbyte: the byte we are searching for * @section: pointer where the section id will be stored * @oobregion: OOB region information * * Works like mtd_ooblayout_find_region() except it searches for a specific ECC * byte. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte, int *section, struct mtd_oob_region *oobregion) { return mtd_ooblayout_find_region(mtd, eccbyte, section, oobregion, mtd_ooblayout_ecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_find_eccregion); /** * mtd_ooblayout_get_bytes - Extract OOB bytes from the oob buffer * @mtd: mtd info structure * @buf: destination buffer to store OOB bytes * @oobbuf: OOB buffer * @start: first byte to retrieve * @nbytes: number of bytes to retrieve * @iter: section iterator * * Extract bytes attached to a specific category (ECC or free) * from the OOB buffer and copy them into buf. * * Returns zero on success, a negative error code otherwise. */ static int mtd_ooblayout_get_bytes(struct mtd_info *mtd, u8 *buf, const u8 *oobbuf, int start, int nbytes, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { struct mtd_oob_region oobregion; int section, ret; ret = mtd_ooblayout_find_region(mtd, start, §ion, &oobregion, iter); while (!ret) { int cnt; cnt = min_t(int, nbytes, oobregion.length); memcpy(buf, oobbuf + oobregion.offset, cnt); buf += cnt; nbytes -= cnt; if (!nbytes) break; ret = iter(mtd, ++section, &oobregion); } return ret; } /** * mtd_ooblayout_set_bytes - put OOB bytes into the oob buffer * @mtd: mtd info structure * @buf: source buffer to get OOB bytes from * @oobbuf: OOB buffer * @start: first OOB byte to set * @nbytes: number of OOB bytes to set * @iter: section iterator * * Fill the OOB buffer with data provided in buf. The category (ECC or free) * is selected by passing the appropriate iterator. * * Returns zero on success, a negative error code otherwise. */ static int mtd_ooblayout_set_bytes(struct mtd_info *mtd, const u8 *buf, u8 *oobbuf, int start, int nbytes, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { struct mtd_oob_region oobregion; int section, ret; ret = mtd_ooblayout_find_region(mtd, start, §ion, &oobregion, iter); while (!ret) { int cnt; cnt = min_t(int, nbytes, oobregion.length); memcpy(oobbuf + oobregion.offset, buf, cnt); buf += cnt; nbytes -= cnt; if (!nbytes) break; ret = iter(mtd, ++section, &oobregion); } return ret; } /** * mtd_ooblayout_count_bytes - count the number of bytes in a OOB category * @mtd: mtd info structure * @iter: category iterator * * Count the number of bytes in a given category. * * Returns a positive value on success, a negative error code otherwise. */ static int mtd_ooblayout_count_bytes(struct mtd_info *mtd, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { struct mtd_oob_region oobregion; int section = 0, ret, nbytes = 0; while (1) { ret = iter(mtd, section++, &oobregion); if (ret) { if (ret == -ERANGE) ret = nbytes; break; } nbytes += oobregion.length; } return ret; } /** * mtd_ooblayout_get_eccbytes - extract ECC bytes from the oob buffer * @mtd: mtd info structure * @eccbuf: destination buffer to store ECC bytes * @oobbuf: OOB buffer * @start: first ECC byte to retrieve * @nbytes: number of ECC bytes to retrieve * * Works like mtd_ooblayout_get_bytes(), except it acts on ECC bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf, const u8 *oobbuf, int start, int nbytes) { return mtd_ooblayout_get_bytes(mtd, eccbuf, oobbuf, start, nbytes, mtd_ooblayout_ecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_get_eccbytes); /** * mtd_ooblayout_set_eccbytes - set ECC bytes into the oob buffer * @mtd: mtd info structure * @eccbuf: source buffer to get ECC bytes from * @oobbuf: OOB buffer * @start: first ECC byte to set * @nbytes: number of ECC bytes to set * * Works like mtd_ooblayout_set_bytes(), except it acts on ECC bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf, u8 *oobbuf, int start, int nbytes) { return mtd_ooblayout_set_bytes(mtd, eccbuf, oobbuf, start, nbytes, mtd_ooblayout_ecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_set_eccbytes); /** * mtd_ooblayout_get_databytes - extract data bytes from the oob buffer * @mtd: mtd info structure * @databuf: destination buffer to store ECC bytes * @oobbuf: OOB buffer * @start: first ECC byte to retrieve * @nbytes: number of ECC bytes to retrieve * * Works like mtd_ooblayout_get_bytes(), except it acts on free bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf, const u8 *oobbuf, int start, int nbytes) { return mtd_ooblayout_get_bytes(mtd, databuf, oobbuf, start, nbytes, mtd_ooblayout_free); } EXPORT_SYMBOL_GPL(mtd_ooblayout_get_databytes); /** * mtd_ooblayout_set_databytes - set data bytes into the oob buffer * @mtd: mtd info structure * @databuf: source buffer to get data bytes from * @oobbuf: OOB buffer * @start: first ECC byte to set * @nbytes: number of ECC bytes to set * * Works like mtd_ooblayout_set_bytes(), except it acts on free bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf, u8 *oobbuf, int start, int nbytes) { return mtd_ooblayout_set_bytes(mtd, databuf, oobbuf, start, nbytes, mtd_ooblayout_free); } EXPORT_SYMBOL_GPL(mtd_ooblayout_set_databytes); /** * mtd_ooblayout_count_freebytes - count the number of free bytes in OOB * @mtd: mtd info structure * * Works like mtd_ooblayout_count_bytes(), except it count free bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_count_freebytes(struct mtd_info *mtd) { return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_free); } EXPORT_SYMBOL_GPL(mtd_ooblayout_count_freebytes); /** * mtd_ooblayout_count_eccbytes - count the number of ECC bytes in OOB * @mtd: mtd info structure * * Works like mtd_ooblayout_count_bytes(), except it count ECC bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd) { return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_ecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes); /* * Method to access the protection register area, present in some flash * devices. The user data is one time programmable but the factory data is read * only. */ int mtd_get_fact_prot_info(struct mtd_info *mtd, size_t len, size_t *retlen, struct otp_info *buf) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_get_fact_prot_info) return -EOPNOTSUPP; if (!len) return 0; return master->_get_fact_prot_info(master, len, retlen, buf); } EXPORT_SYMBOL_GPL(mtd_get_fact_prot_info); int mtd_read_fact_prot_reg(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct mtd_info *master = mtd_get_master(mtd); *retlen = 0; if (!master->_read_fact_prot_reg) return -EOPNOTSUPP; if (!len) return 0; return master->_read_fact_prot_reg(master, from, len, retlen, buf); } EXPORT_SYMBOL_GPL(mtd_read_fact_prot_reg); int mtd_get_user_prot_info(struct mtd_info *mtd, size_t len, size_t *retlen, struct otp_info *buf) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_get_user_prot_info) return -EOPNOTSUPP; if (!len) return 0; return master->_get_user_prot_info(master, len, retlen, buf); } EXPORT_SYMBOL_GPL(mtd_get_user_prot_info); int mtd_read_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct mtd_info *master = mtd_get_master(mtd); *retlen = 0; if (!master->_read_user_prot_reg) return -EOPNOTSUPP; if (!len) return 0; return master->_read_user_prot_reg(master, from, len, retlen, buf); } EXPORT_SYMBOL_GPL(mtd_read_user_prot_reg); int mtd_write_user_prot_reg(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct mtd_info *master = mtd_get_master(mtd); int ret; *retlen = 0; if (!master->_write_user_prot_reg) return -EOPNOTSUPP; if (!len) return 0; ret = master->_write_user_prot_reg(master, to, len, retlen, buf); if (ret) return ret; /* * If no data could be written at all, we are out of memory and * must return -ENOSPC. */ return (*retlen) ? 0 : -ENOSPC; } EXPORT_SYMBOL_GPL(mtd_write_user_prot_reg); int mtd_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_lock_user_prot_reg) return -EOPNOTSUPP; if (!len) return 0; return master->_lock_user_prot_reg(master, from, len); } EXPORT_SYMBOL_GPL(mtd_lock_user_prot_reg); int mtd_erase_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_erase_user_prot_reg) return -EOPNOTSUPP; if (!len) return 0; return master->_erase_user_prot_reg(master, from, len); } EXPORT_SYMBOL_GPL(mtd_erase_user_prot_reg); /* Chip-supported device locking */ int mtd_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_lock) return -EOPNOTSUPP; if (ofs < 0 || ofs >= mtd->size || len > mtd->size - ofs) return -EINVAL; if (!len) return 0; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; } return master->_lock(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_lock); int mtd_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_unlock) return -EOPNOTSUPP; if (ofs < 0 || ofs >= mtd->size || len > mtd->size - ofs) return -EINVAL; if (!len) return 0; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; } return master->_unlock(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_unlock); int mtd_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_is_locked) return -EOPNOTSUPP; if (ofs < 0 || ofs >= mtd->size || len > mtd->size - ofs) return -EINVAL; if (!len) return 0; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; } return master->_is_locked(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_is_locked); int mtd_block_isreserved(struct mtd_info *mtd, loff_t ofs) { struct mtd_info *master = mtd_get_master(mtd); if (ofs < 0 || ofs >= mtd->size) return -EINVAL; if (!master->_block_isreserved) return 0; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; return master->_block_isreserved(master, mtd_get_master_ofs(mtd, ofs)); } EXPORT_SYMBOL_GPL(mtd_block_isreserved); int mtd_block_isbad(struct mtd_info *mtd, loff_t ofs) { struct mtd_info *master = mtd_get_master(mtd); if (ofs < 0 || ofs >= mtd->size) return -EINVAL; if (!master->_block_isbad) return 0; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; return master->_block_isbad(master, mtd_get_master_ofs(mtd, ofs)); } EXPORT_SYMBOL_GPL(mtd_block_isbad); int mtd_block_markbad(struct mtd_info *mtd, loff_t ofs) { struct mtd_info *master = mtd_get_master(mtd); int ret; if (!master->_block_markbad) return -EOPNOTSUPP; if (ofs < 0 || ofs >= mtd->size) return -EINVAL; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; ret = master->_block_markbad(master, mtd_get_master_ofs(mtd, ofs)); if (ret) return ret; while (mtd->parent) { mtd->ecc_stats.badblocks++; mtd = mtd->parent; } return 0; } EXPORT_SYMBOL_GPL(mtd_block_markbad); ALLOW_ERROR_INJECTION(mtd_block_markbad, ERRNO); /* * default_mtd_writev - the default writev method * @mtd: mtd device description object pointer * @vecs: the vectors to write * @count: count of vectors in @vecs * @to: the MTD device offset to write to * @retlen: on exit contains the count of bytes written to the MTD device. * * This function returns zero in case of success and a negative error code in * case of failure. */ static int default_mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen) { unsigned long i; size_t totlen = 0, thislen; int ret = 0; for (i = 0; i < count; i++) { if (!vecs[i].iov_len) continue; ret = mtd_write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base); totlen += thislen; if (ret || thislen != vecs[i].iov_len) break; to += vecs[i].iov_len; } *retlen = totlen; return ret; } /* * mtd_writev - the vector-based MTD write method * @mtd: mtd device description object pointer * @vecs: the vectors to write * @count: count of vectors in @vecs * @to: the MTD device offset to write to * @retlen: on exit contains the count of bytes written to the MTD device. * * This function returns zero in case of success and a negative error code in * case of failure. */ int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen) { struct mtd_info *master = mtd_get_master(mtd); *retlen = 0; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; if (!master->_writev) return default_mtd_writev(mtd, vecs, count, to, retlen); return master->_writev(master, vecs, count, mtd_get_master_ofs(mtd, to), retlen); } EXPORT_SYMBOL_GPL(mtd_writev); /** * mtd_kmalloc_up_to - allocate a contiguous buffer up to the specified size * @mtd: mtd device description object pointer * @size: a pointer to the ideal or maximum size of the allocation, points * to the actual allocation size on success. * * This routine attempts to allocate a contiguous kernel buffer up to * the specified size, backing off the size of the request exponentially * until the request succeeds or until the allocation size falls below * the system page size. This attempts to make sure it does not adversely * impact system performance, so when allocating more than one page, we * ask the memory allocator to avoid re-trying, swapping, writing back * or performing I/O. * * Note, this function also makes sure that the allocated buffer is aligned to * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. * * This is called, for example by mtd_{read,write} and jffs2_scan_medium, * to handle smaller (i.e. degraded) buffer allocations under low- or * fragmented-memory situations where such reduced allocations, from a * requested ideal, are allowed. * * Returns a pointer to the allocated buffer on success; otherwise, NULL. */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY; size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); void *kbuf; *size = min_t(size_t, *size, KMALLOC_MAX_SIZE); while (*size > min_alloc) { kbuf = kmalloc(*size, flags); if (kbuf) return kbuf; *size >>= 1; *size = ALIGN(*size, mtd->writesize); } /* * For the last resort allocation allow 'kmalloc()' to do all sorts of * things (write-back, dropping caches, etc) by using GFP_KERNEL. */ return kmalloc(*size, GFP_KERNEL); } EXPORT_SYMBOL_GPL(mtd_kmalloc_up_to); #ifdef CONFIG_PROC_FS /*====================================================================*/ /* Support for /proc/mtd */ static int mtd_proc_show(struct seq_file *m, void *v) { struct mtd_info *mtd; seq_puts(m, "dev: size erasesize name\n"); mutex_lock(&mtd_table_mutex); mtd_for_each_device(mtd) { seq_printf(m, "mtd%d: %8.8llx %8.8x \"%s\"\n", mtd->index, (unsigned long long)mtd->size, mtd->erasesize, mtd->name); } mutex_unlock(&mtd_table_mutex); return 0; } #endif /* CONFIG_PROC_FS */ /*====================================================================*/ /* Init code */ static struct backing_dev_info * __init mtd_bdi_init(const char *name) { struct backing_dev_info *bdi; int ret; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return ERR_PTR(-ENOMEM); bdi->ra_pages = 0; bdi->io_pages = 0; /* * We put '-0' suffix to the name to get the same name format as we * used to get. Since this is called only once, we get a unique name. */ ret = bdi_register(bdi, "%.28s-0", name); if (ret) bdi_put(bdi); return ret ? ERR_PTR(ret) : bdi; } static struct proc_dir_entry *proc_mtd; static int __init init_mtd(void) { int ret; ret = class_register(&mtd_class); if (ret) goto err_reg; mtd_bdi = mtd_bdi_init("mtd"); if (IS_ERR(mtd_bdi)) { ret = PTR_ERR(mtd_bdi); goto err_bdi; } proc_mtd = proc_create_single("mtd", 0, NULL, mtd_proc_show); ret = init_mtdchar(); if (ret) goto out_procfs; dfs_dir_mtd = debugfs_create_dir("mtd", NULL); debugfs_create_bool("expert_analysis_mode", 0600, dfs_dir_mtd, &mtd_expert_analysis_mode); return 0; out_procfs: if (proc_mtd) remove_proc_entry("mtd", NULL); bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); err_bdi: class_unregister(&mtd_class); err_reg: pr_err("Error registering mtd class or bdi: %d\n", ret); return ret; } static void __exit cleanup_mtd(void) { debugfs_remove_recursive(dfs_dir_mtd); cleanup_mtdchar(); if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); idr_destroy(&mtd_idr); } module_init(init_mtd); module_exit(cleanup_mtd); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>"); MODULE_DESCRIPTION("Core MTD registration and access routines"); |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | // SPDX-License-Identifier: GPL-2.0 /* * Hibernation support for x86 * * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> #include <linux/scatterlist.h> #include <linux/kdebug.h> #include <linux/cpu.h> #include <linux/pgtable.h> #include <linux/types.h> #include <linux/crc32.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/proto.h> #include <asm/page.h> #include <asm/mtrr.h> #include <asm/sections.h> #include <asm/suspend.h> #include <asm/tlbflush.h> /* * Address to jump to in the last phase of restore in order to get to the image * kernel's text (this value is passed in the image header). */ unsigned long restore_jump_address __visible; unsigned long jump_address_phys; /* * Value of the cr3 register from before the hibernation (this value is passed * in the image header). */ unsigned long restore_cr3 __visible; unsigned long temp_pgt __visible; unsigned long relocated_restore_code __visible; /** * pfn_is_nosave - check if given pfn is in the 'nosave' section */ int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn; unsigned long nosave_end_pfn; nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn; } struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; unsigned long e820_checksum; }; /** * compute_e820_crc32 - calculate crc32 of a given e820 table * * @table: the e820 table to be calculated * * Return: the resulting checksum */ static inline u32 compute_e820_crc32(struct e820_table *table) { int size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; return ~crc32_le(~0, (unsigned char const *)table, size); } #ifdef CONFIG_X86_64 #define RESTORE_MAGIC 0x23456789ABCDEF02UL #else #define RESTORE_MAGIC 0x12345679UL #endif /** * arch_hibernation_header_save - populate the architecture specific part * of a hibernation image header * @addr: address to save the data at */ int arch_hibernation_header_save(void *addr, unsigned int max_size) { struct restore_data_record *rdr = addr; if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; rdr->magic = RESTORE_MAGIC; rdr->jump_address = (unsigned long)restore_registers; rdr->jump_address_phys = __pa_symbol(restore_registers); /* * The restore code fixes up CR3 and CR4 in the following sequence: * * [in hibernation asm] * 1. CR3 <= temporary page tables * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) * 3. CR3 <= rdr->cr3 * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) * [in restore_processor_state()] * 5. CR4 <= saved CR4 * 6. CR3 <= saved CR3 * * Our mmu_cr4_features has CR4.PCIDE=0, and toggling * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so * rdr->cr3 needs to point to valid page tables but must not * have any of the PCID bits set. */ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; rdr->e820_checksum = compute_e820_crc32(e820_table_firmware); return 0; } /** * arch_hibernation_header_restore - read the architecture specific data * from the hibernation image header * @addr: address to read the data from */ int arch_hibernation_header_restore(void *addr) { struct restore_data_record *rdr = addr; if (rdr->magic != RESTORE_MAGIC) { pr_crit("Unrecognized hibernate image header format!\n"); return -EINVAL; } restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) { pr_crit("Hibernate inconsistent memory map detected!\n"); return -ENODEV; } return 0; } int relocate_restore_code(void) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) return -ENOMEM; __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code); if (p4d_leaf(*p4d)) { set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); goto out; } pud = pud_offset(p4d, relocated_restore_code); if (pud_leaf(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); goto out; } pmd = pmd_offset(pud, relocated_restore_code); if (pmd_leaf(*pmd)) { set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); goto out; } pte = pte_offset_kernel(pmd, relocated_restore_code); set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); out: __flush_tlb_all(); return 0; } int arch_resume_nosmt(void) { int ret = 0; /* * We reached this while coming out of hibernation. This means * that SMT siblings are sleeping in hlt, as mwait is not safe * against control transition during resume (see comment in * hibernate_resume_nonboot_cpu_disable()). * * If the resumed kernel has SMT disabled, we have to take all the * SMT siblings out of hlt, and offline them again so that they * end up in mwait proper. * * Called with hotplug disabled. */ cpu_hotplug_enable(); if (cpu_smt_control == CPU_SMT_DISABLED || cpu_smt_control == CPU_SMT_FORCE_DISABLED) { enum cpuhp_smt_control old = cpu_smt_control; ret = cpuhp_smt_enable(); if (ret) goto out; ret = cpuhp_smt_disable(old); if (ret) goto out; } out: cpu_hotplug_disable(); return ret; } |
| 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/xprt.c * * This is a generic RPC call interface supporting congestion avoidance, * and asynchronous calls. * * The interface works like this: * * - When a process places a call, it allocates a request slot if * one is available. Otherwise, it sleeps on the backlog queue * (xprt_reserve). * - Next, the caller puts together the RPC message, stuffs it into * the request struct, and calls xprt_transmit(). * - xprt_transmit sends the message and installs the caller on the * transport's wait list. At the same time, if a reply is expected, * it installs a timer that is run after the packet's timeout has * expired. * - When a packet arrives, the data_ready handler walks the list of * pending requests for that transport. If a matching XID is found, the * caller is woken up, and the timer removed. * - When no reply arrives within the timeout interval, the timer is * fired by the kernel and runs xprt_timer(). It either adjusts the * timeout values (minor timeout) or wakes up the caller with a status * of -ETIMEDOUT. * - When the caller receives a notification from RPC that a reply arrived, * it should release the RPC slot, and process the reply. * If the call timed out, it may choose to retry the operation by * adjusting the initial timeout value, and simply calling rpc_call * again. * * Support for async RPC is done through a set of RPC-specific scheduling * primitives that `transparently' work for processes as well as async * tasks that rely on callbacks. * * Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de> * * Transport switch API copyright (C) 2005, Chuck Lever <cel@netapp.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/workqueue.h> #include <linux/net.h> #include <linux/ktime.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <trace/events/sunrpc.h> #include "sunrpc.h" #include "sysfs.h" #include "fail.h" /* * Local variables */ #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_XPRT #endif /* * Local functions */ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_destroy(struct rpc_xprt *xprt); static void xprt_request_init(struct rpc_task *task); static int xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); static unsigned long xprt_request_timeout(const struct rpc_rqst *req) { unsigned long timeout = jiffies + req->rq_timeout; if (time_before(timeout, req->rq_majortimeo)) return timeout; return req->rq_majortimeo; } /** * xprt_register_transport - register a transport implementation * @transport: transport to register * * If a transport implementation is loaded as a kernel module, it can * call this interface to make itself known to the RPC client. * * Returns: * 0: transport successfully registered * -EEXIST: transport already registered * -EINVAL: transport module being unloaded */ int xprt_register_transport(struct xprt_class *transport) { struct xprt_class *t; int result; result = -EEXIST; spin_lock(&xprt_list_lock); list_for_each_entry(t, &xprt_list, list) { /* don't register the same transport class twice */ if (t->ident == transport->ident) goto out; } list_add_tail(&transport->list, &xprt_list); printk(KERN_INFO "RPC: Registered %s transport module.\n", transport->name); result = 0; out: spin_unlock(&xprt_list_lock); return result; } EXPORT_SYMBOL_GPL(xprt_register_transport); /** * xprt_unregister_transport - unregister a transport implementation * @transport: transport to unregister * * Returns: * 0: transport successfully unregistered * -ENOENT: transport never registered */ int xprt_unregister_transport(struct xprt_class *transport) { struct xprt_class *t; int result; result = 0; spin_lock(&xprt_list_lock); list_for_each_entry(t, &xprt_list, list) { if (t == transport) { printk(KERN_INFO "RPC: Unregistered %s transport module.\n", transport->name); list_del_init(&transport->list); goto out; } } result = -ENOENT; out: spin_unlock(&xprt_list_lock); return result; } EXPORT_SYMBOL_GPL(xprt_unregister_transport); static void xprt_class_release(const struct xprt_class *t) { module_put(t->owner); } static const struct xprt_class * xprt_class_find_by_ident_locked(int ident) { const struct xprt_class *t; list_for_each_entry(t, &xprt_list, list) { if (t->ident != ident) continue; if (!try_module_get(t->owner)) continue; return t; } return NULL; } static const struct xprt_class * xprt_class_find_by_ident(int ident) { const struct xprt_class *t; spin_lock(&xprt_list_lock); t = xprt_class_find_by_ident_locked(ident); spin_unlock(&xprt_list_lock); return t; } static const struct xprt_class * xprt_class_find_by_netid_locked(const char *netid) { const struct xprt_class *t; unsigned int i; list_for_each_entry(t, &xprt_list, list) { for (i = 0; t->netid[i][0] != '\0'; i++) { if (strcmp(t->netid[i], netid) != 0) continue; if (!try_module_get(t->owner)) continue; return t; } } return NULL; } static const struct xprt_class * xprt_class_find_by_netid(const char *netid) { const struct xprt_class *t; spin_lock(&xprt_list_lock); t = xprt_class_find_by_netid_locked(netid); if (!t) { spin_unlock(&xprt_list_lock); request_module("rpc%s", netid); spin_lock(&xprt_list_lock); t = xprt_class_find_by_netid_locked(netid); } spin_unlock(&xprt_list_lock); return t; } /** * xprt_find_transport_ident - convert a netid into a transport identifier * @netid: transport to load * * Returns: * > 0: transport identifier * -ENOENT: transport module not available */ int xprt_find_transport_ident(const char *netid) { const struct xprt_class *t; int ret; t = xprt_class_find_by_netid(netid); if (!t) return -ENOENT; ret = t->ident; xprt_class_release(t); return ret; } EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) clear_bit_unlock(XPRT_LOCKED, &xprt->state); else queue_work(xprtiod_workqueue, &xprt->task_cleanup); } /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport * @xprt: pointer to the target transport * * This prevents mixing the payload of separate requests, and prevents * transport connects from colliding with writes. No congestion control * is provided. */ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) goto out_locked; goto out_sleep; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; xprt->snd_task = task; out_locked: trace_xprt_reserve_xprt(xprt, task); return 1; out_unlock: xprt_clear_locked(xprt); out_sleep: task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task) || RPC_IS_SOFTCONN(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, xprt_request_timeout(req)); else rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); static bool xprt_need_congestion_window_wait(struct rpc_xprt *xprt) { return test_bit(XPRT_CWND_WAIT, &xprt->state); } static void xprt_set_congestion_window_wait(struct rpc_xprt *xprt) { if (!list_empty(&xprt->xmit_queue)) { /* Peek at head of queue to see if it can make progress */ if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst, rq_xmit)->rq_cong) return; } set_bit(XPRT_CWND_WAIT, &xprt->state); } static void xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt) { if (!RPCXPRT_CONGESTED(xprt)) clear_bit(XPRT_CWND_WAIT, &xprt->state); } /* * xprt_reserve_xprt_cong - serialize write access to transports * @task: task that is requesting access to the transport * * Same as xprt_reserve_xprt, but Van Jacobson congestion control is * integrated into the decision of whether a request is allowed to be * woken up and given access to the transport. * Note that the lock is only granted if we know there are free slots. */ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) goto out_locked; goto out_sleep; } if (req == NULL) { xprt->snd_task = task; goto out_locked; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; goto out_locked; } out_unlock: xprt_clear_locked(xprt); out_sleep: task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task) || RPC_IS_SOFTCONN(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, xprt_request_timeout(req)); else rpc_sleep_on(&xprt->sending, task, NULL); return 0; out_locked: trace_xprt_reserve_cong(xprt, task); return 1; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task) return 1; spin_lock(&xprt->transport_lock); retval = xprt->ops->reserve_xprt(xprt, task); spin_unlock(&xprt->transport_lock); return retval; } static bool __xprt_lock_write_func(struct rpc_task *task, void *data) { struct rpc_xprt *xprt = data; xprt->snd_task = task; return true; } static void __xprt_lock_write_next(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (xprt_need_congestion_window_wait(xprt)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting * @task: task that is releasing access to the transport * * Note that "task" can be NULL. No congestion control is provided. */ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } trace_xprt_release_xprt(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt); /** * xprt_release_xprt_cong - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting * @task: task that is releasing access to the transport * * Note that "task" can be NULL. Another task is awoken to use the * transport if the transport's congestion window allows it. */ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } trace_xprt_release_cong(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task != task) return; spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); spin_unlock(&xprt->transport_lock); } /* * Van Jacobson congestion avoidance. Check if the congestion window * overflowed. Put the task to sleep if this is the case. */ static int __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (req->rq_cong) return 1; trace_xprt_get_cong(xprt, req->rq_task); if (RPCXPRT_CONGESTED(xprt)) { xprt_set_congestion_window_wait(xprt); return 0; } req->rq_cong = 1; xprt->cong += RPC_CWNDSCALE; return 1; } /* * Adjust the congestion window, and wake up the next task * that has been sleeping due to congestion */ static void __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (!req->rq_cong) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; xprt_test_and_clear_congestion_window_wait(xprt); trace_xprt_put_cong(xprt, req->rq_task); __xprt_lock_write_next_cong(xprt); } /** * xprt_request_get_cong - Request congestion control credits * @xprt: pointer to transport * @req: pointer to RPC request * * Useful for transports that require congestion control. */ bool xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { bool ret = false; if (req->rq_cong) return true; spin_lock(&xprt->transport_lock); ret = __xprt_get_cong(xprt, req) != 0; spin_unlock(&xprt->transport_lock); return ret; } EXPORT_SYMBOL_GPL(xprt_request_get_cong); /** * xprt_release_rqst_cong - housekeeping when request is complete * @task: RPC request that recently completed * * Useful for transports that require congestion control. */ void xprt_release_rqst_cong(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; __xprt_put_cong(req->rq_xprt, req); } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt) { if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) __xprt_lock_write_next_cong(xprt); } /* * Clear the congestion window wait flag and wake up the next * entry on xprt->sending */ static void xprt_clear_congestion_window_wait(struct rpc_xprt *xprt) { if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) { spin_lock(&xprt->transport_lock); __xprt_lock_write_next_cong(xprt); spin_unlock(&xprt->transport_lock); } } /** * xprt_adjust_cwnd - adjust transport congestion window * @xprt: pointer to xprt * @task: recently completed RPC request used to adjust window * @result: result code of completed RPC request * * The transport code maintains an estimate on the maximum number of out- * standing RPC requests, using a smoothed version of the congestion * avoidance implemented in 44BSD. This is basically the Van Jacobson * congestion algorithm: If a retransmit occurs, the congestion window is * halved; otherwise, it is incremented by 1/cwnd when * * - a reply is received and * - a full number of requests are outstanding and * - the congestion window hasn't been updated recently. */ void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result) { struct rpc_rqst *req = task->tk_rqstp; unsigned long cwnd = xprt->cwnd; if (result >= 0 && cwnd <= xprt->cong) { /* The (cwnd >> 1) term makes sure * the result gets rounded properly. */ cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; if (cwnd > RPC_MAXCWND(xprt)) cwnd = RPC_MAXCWND(xprt); __xprt_lock_write_next_cong(xprt); } else if (result == -ETIMEDOUT) { cwnd >>= 1; if (cwnd < RPC_CWNDSCALE) cwnd = RPC_CWNDSCALE; } dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", xprt->cong, xprt->cwnd, cwnd); xprt->cwnd = cwnd; __xprt_put_cong(xprt, req); } EXPORT_SYMBOL_GPL(xprt_adjust_cwnd); /** * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue * @xprt: transport with waiting tasks * @status: result code to plant in each task before waking it * */ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) { if (status < 0) rpc_wake_up_status(&xprt->pending, status); else rpc_wake_up(&xprt->pending); } EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); /** * xprt_wait_for_buffer_space - wait for transport output buffer to clear * @xprt: transport * * Note that we only set the timer for the case of RPC_IS_SOFT(), since * we don't in general want to force a socket disconnection due to * an incomplete RPC call transmission. */ void xprt_wait_for_buffer_space(struct rpc_xprt *xprt) { set_bit(XPRT_WRITE_SPACE, &xprt->state); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); static bool xprt_clear_write_space_locked(struct rpc_xprt *xprt) { if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) { __xprt_lock_write_next(xprt); dprintk("RPC: write space: waking waiting task on " "xprt %p\n", xprt); return true; } return false; } /** * xprt_write_space - wake the task waiting for transport output buffer space * @xprt: transport with waiting tasks * * Can be called in a soft IRQ context, so xprt_write_space never sleeps. */ bool xprt_write_space(struct rpc_xprt *xprt) { bool ret; if (!test_bit(XPRT_WRITE_SPACE, &xprt->state)) return false; spin_lock(&xprt->transport_lock); ret = xprt_clear_write_space_locked(xprt); spin_unlock(&xprt->transport_lock); return ret; } EXPORT_SYMBOL_GPL(xprt_write_space); static unsigned long xprt_abs_ktime_to_jiffies(ktime_t abstime) { s64 delta = ktime_to_ns(ktime_get() - abstime); return likely(delta >= 0) ? jiffies - nsecs_to_jiffies(delta) : jiffies + nsecs_to_jiffies(-delta); } static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req, const struct rpc_timeout *to) { unsigned long majortimeo = req->rq_timeout; if (to->to_exponential) majortimeo <<= to->to_retries; else majortimeo += to->to_increment * to->to_retries; if (majortimeo > to->to_maxval || majortimeo == 0) majortimeo = to->to_maxval; return majortimeo; } static void xprt_reset_majortimeo(struct rpc_rqst *req, const struct rpc_timeout *to) { req->rq_majortimeo += xprt_calc_majortimeo(req, to); } static void xprt_reset_minortimeo(struct rpc_rqst *req) { req->rq_minortimeo += req->rq_timeout; } static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req, const struct rpc_timeout *to) { unsigned long time_init; struct rpc_xprt *xprt = req->rq_xprt; if (likely(xprt && xprt_connected(xprt))) time_init = jiffies; else time_init = xprt_abs_ktime_to_jiffies(task->tk_start); req->rq_timeout = to->to_initval; req->rq_majortimeo = time_init + xprt_calc_majortimeo(req, to); req->rq_minortimeo = time_init + req->rq_timeout; } /** * xprt_adjust_timeout - adjust timeout values for next retransmit * @req: RPC request containing parameters to use for the adjustment * */ int xprt_adjust_timeout(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; int status = 0; if (time_before(jiffies, req->rq_majortimeo)) { if (time_before(jiffies, req->rq_minortimeo)) return status; if (to->to_exponential) req->rq_timeout <<= 1; else req->rq_timeout += to->to_increment; if (to->to_maxval && req->rq_timeout >= to->to_maxval) req->rq_timeout = to->to_maxval; req->rq_retries++; } else { req->rq_timeout = to->to_initval; req->rq_retries = 0; xprt_reset_majortimeo(req, to); /* Reset the RTT counters == "slow start" */ spin_lock(&xprt->transport_lock); rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); spin_unlock(&xprt->transport_lock); status = -ETIMEDOUT; } xprt_reset_minortimeo(req); if (req->rq_timeout == 0) { printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n"); req->rq_timeout = 5 * HZ; } return status; } static void xprt_autoclose(struct work_struct *work) { struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); unsigned int pflags = memalloc_nofs_save(); trace_xprt_disconnect_auto(xprt); xprt->connect_cookie++; smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); wake_up_bit(&xprt->state, XPRT_LOCKED); memalloc_nofs_restore(pflags); } /** * xprt_disconnect_done - mark a transport as disconnected * @xprt: transport to flag for disconnect * */ void xprt_disconnect_done(struct rpc_xprt *xprt) { trace_xprt_disconnect_done(xprt); spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); xprt_clear_congestion_window_wait_locked(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_disconnect_done); /** * xprt_schedule_autoclose_locked - Try to schedule an autoclose RPC call * @xprt: transport to disconnect */ static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_CLOSE_WAIT, &xprt->state)) return; if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(xprtiod_workqueue, &xprt->task_cleanup); else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) rpc_wake_up_queued_task_set_status(&xprt->pending, xprt->snd_task, -ENOTCONN); } /** * xprt_force_disconnect - force a transport to disconnect * @xprt: transport to disconnect * */ void xprt_force_disconnect(struct rpc_xprt *xprt) { trace_xprt_disconnect_force(xprt); /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); xprt_schedule_autoclose_locked(xprt); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); static unsigned int xprt_connect_cookie(struct rpc_xprt *xprt) { return READ_ONCE(xprt->connect_cookie); } static bool xprt_request_retransmit_after_disconnect(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; return req->rq_connect_cookie != xprt_connect_cookie(xprt) || !xprt_connected(xprt); } /** * xprt_conditional_disconnect - force a transport to disconnect * @xprt: transport to disconnect * @cookie: 'connection cookie' * * This attempts to break the connection if and only if 'cookie' matches * the current transport 'connection cookie'. It ensures that we don't * try to break the connection more than once when we need to retransmit * a batch of RPC requests. * */ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) { /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); if (cookie != xprt->connect_cookie) goto out; if (test_bit(XPRT_CLOSING, &xprt->state)) goto out; xprt_schedule_autoclose_locked(xprt); out: spin_unlock(&xprt->transport_lock); } static bool xprt_has_timer(const struct rpc_xprt *xprt) { return xprt->idle_timeout != 0; } static void xprt_schedule_autodisconnect(struct rpc_xprt *xprt) __must_hold(&xprt->transport_lock) { xprt->last_used = jiffies; if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); } static void xprt_init_autodisconnect(struct timer_list *t) { struct rpc_xprt *xprt = from_timer(xprt, t, timer); if (!RB_EMPTY_ROOT(&xprt->recv_queue)) return; /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ xprt->last_used = jiffies; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; queue_work(xprtiod_workqueue, &xprt->task_cleanup); } #if IS_ENABLED(CONFIG_FAIL_SUNRPC) static void xprt_inject_disconnect(struct rpc_xprt *xprt) { if (!fail_sunrpc.ignore_client_disconnect && should_fail(&fail_sunrpc.attr, 1)) xprt->ops->inject_disconnect(xprt); } #else static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) { } #endif bool xprt_lock_connect(struct rpc_xprt *xprt, struct rpc_task *task, void *cookie) { bool ret = false; spin_lock(&xprt->transport_lock); if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; if (xprt->snd_task != task) goto out; set_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->snd_task = cookie; ret = true; out: spin_unlock(&xprt->transport_lock); return ret; } EXPORT_SYMBOL_GPL(xprt_lock_connect); void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) { spin_lock(&xprt->transport_lock); if (xprt->snd_task != cookie) goto out; if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; xprt->snd_task =NULL; clear_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->ops->release_xprt(xprt, NULL); xprt_schedule_autodisconnect(xprt); out: spin_unlock(&xprt->transport_lock); wake_up_bit(&xprt->state, XPRT_LOCKED); } EXPORT_SYMBOL_GPL(xprt_unlock_connect); /** * xprt_connect - schedule a transport connect operation * @task: RPC task that is requesting the connect * */ void xprt_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; trace_xprt_connect(xprt); if (!xprt_bound(xprt)) { task->tk_status = -EAGAIN; return; } if (!xprt_lock_write(xprt, task)) return; if (!xprt_connected(xprt) && !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on_timeout(&xprt->pending, task, NULL, xprt_request_timeout(task->tk_rqstp)); if (test_bit(XPRT_CLOSING, &xprt->state)) return; if (xprt_test_and_set_connecting(xprt)) return; /* Race breaker */ if (!xprt_connected(xprt)) { xprt->stat.connect_start = jiffies; xprt->ops->connect(xprt, task); } else { xprt_clear_connecting(xprt); task->tk_status = 0; rpc_wake_up_queued_task(&xprt->pending, task); } } xprt_release_write(xprt, task); } /** * xprt_reconnect_delay - compute the wait before scheduling a connect * @xprt: transport instance * */ unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt) { unsigned long start, now = jiffies; start = xprt->stat.connect_start + xprt->reestablish_timeout; if (time_after(start, now)) return start - now; return 0; } EXPORT_SYMBOL_GPL(xprt_reconnect_delay); /** * xprt_reconnect_backoff - compute the new re-establish timeout * @xprt: transport instance * @init_to: initial reestablish timeout * */ void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to) { xprt->reestablish_timeout <<= 1; if (xprt->reestablish_timeout > xprt->max_reconnect_timeout) xprt->reestablish_timeout = xprt->max_reconnect_timeout; if (xprt->reestablish_timeout < init_to) xprt->reestablish_timeout = init_to; } EXPORT_SYMBOL_GPL(xprt_reconnect_backoff); enum xprt_xid_rb_cmp { XID_RB_EQUAL, XID_RB_LEFT, XID_RB_RIGHT, }; static enum xprt_xid_rb_cmp xprt_xid_cmp(__be32 xid1, __be32 xid2) { if (xid1 == xid2) return XID_RB_EQUAL; if ((__force u32)xid1 < (__force u32)xid2) return XID_RB_LEFT; return XID_RB_RIGHT; } static struct rpc_rqst * xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid) { struct rb_node *n = xprt->recv_queue.rb_node; struct rpc_rqst *req; while (n != NULL) { req = rb_entry(n, struct rpc_rqst, rq_recv); switch (xprt_xid_cmp(xid, req->rq_xid)) { case XID_RB_LEFT: n = n->rb_left; break; case XID_RB_RIGHT: n = n->rb_right; break; case XID_RB_EQUAL: return req; } } return NULL; } static void xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new) { struct rb_node **p = &xprt->recv_queue.rb_node; struct rb_node *n = NULL; struct rpc_rqst *req; while (*p != NULL) { n = *p; req = rb_entry(n, struct rpc_rqst, rq_recv); switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) { case XID_RB_LEFT: p = &n->rb_left; break; case XID_RB_RIGHT: p = &n->rb_right; break; case XID_RB_EQUAL: WARN_ON_ONCE(new != req); return; } } rb_link_node(&new->rq_recv, n, p); rb_insert_color(&new->rq_recv, &xprt->recv_queue); } static void xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req) { rb_erase(&req->rq_recv, &xprt->recv_queue); } /** * xprt_lookup_rqst - find an RPC request corresponding to an XID * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * * Caller holds xprt->queue_lock. */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { struct rpc_rqst *entry; entry = xprt_request_rb_find(xprt, xid); if (entry != NULL) { trace_xprt_lookup_rqst(xprt, xid, 0); entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); return entry; } dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", ntohl(xid)); trace_xprt_lookup_rqst(xprt, xid, -ENOENT); xprt->stat.bad_xids++; return NULL; } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); static bool xprt_is_pinned_rqst(struct rpc_rqst *req) { return atomic_read(&req->rq_pin) != 0; } /** * xprt_pin_rqst - Pin a request on the transport receive list * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() * so should be holding xprt->queue_lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { atomic_inc(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_pin_rqst); /** * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to pin * * Caller should be holding xprt->queue_lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) { atomic_dec(&req->rq_pin); return; } if (atomic_dec_and_test(&req->rq_pin)) wake_up_var(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_unpin_rqst); static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) { wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req)); } static bool xprt_request_data_received(struct rpc_task *task) { return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0; } static bool xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req) { return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0; } /** * xprt_request_enqueue_receive - Add an request to the receive queue * @task: RPC task * */ int xprt_request_enqueue_receive(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int ret; if (!xprt_request_need_enqueue_receive(task, req)) return 0; ret = xprt_request_prepare(task->tk_rqstp, &req->rq_rcv_buf); if (ret) return ret; spin_lock(&xprt->queue_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ xprt_request_rb_insert(xprt, req); set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); spin_unlock(&xprt->queue_lock); /* Turn off autodisconnect */ del_timer_sync(&xprt->timer); return 0; } /** * xprt_request_dequeue_receive_locked - Remove a request from the receive queue * @task: RPC task * * Caller must hold xprt->queue_lock. */ static void xprt_request_dequeue_receive_locked(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) xprt_request_rb_remove(req->rq_xprt, req); } /** * xprt_update_rtt - Update RPC RTT statistics * @task: RPC request that recently completed * * Caller holds xprt->queue_lock. */ void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_rtt *rtt = task->tk_client->cl_rtt; unsigned int timer = task->tk_msg.rpc_proc->p_timer; long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt)); if (timer) { if (req->rq_ntrans == 1) rpc_update_rtt(rtt, timer, m); rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } } EXPORT_SYMBOL_GPL(xprt_update_rtt); /** * xprt_complete_rqst - called when reply processing is complete * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * * Caller holds xprt->queue_lock. */ void xprt_complete_rqst(struct rpc_task *task, int copied) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; xprt->stat.recvs++; xdr_free_bvec(&req->rq_rcv_buf); req->rq_private_buf.bvec = NULL; req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; xprt_request_dequeue_receive_locked(task); rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); static void xprt_timer(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (task->tk_status != -ETIMEDOUT) return; trace_xprt_timer(xprt, req->rq_xid, task->tk_status); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(xprt, task); } else task->tk_status = 0; } /** * xprt_wait_for_reply_request_def - wait for reply * @task: pointer to rpc_task * * Set a request's retransmit timeout based on the transport's * default timeout parameters. Used by transports that don't adjust * the retransmit timeout based on round-trip time estimation, * and put the task to sleep on the pending queue. */ void xprt_wait_for_reply_request_def(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, xprt_request_timeout(req)); } EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_def); /** * xprt_wait_for_reply_request_rtt - wait for reply using RTT estimator * @task: pointer to rpc_task * * Set a request's retransmit timeout using the RTT estimator, * and put the task to sleep on the pending queue. */ void xprt_wait_for_reply_request_rtt(struct rpc_task *task) { int timer = task->tk_msg.rpc_proc->p_timer; struct rpc_clnt *clnt = task->tk_client; struct rpc_rtt *rtt = clnt->cl_rtt; struct rpc_rqst *req = task->tk_rqstp; unsigned long max_timeout = clnt->cl_timeout->to_maxval; unsigned long timeout; timeout = rpc_calc_rto(rtt, timer); timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; if (timeout > max_timeout || timeout == 0) timeout = max_timeout; rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, jiffies + timeout); } EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_rtt); /** * xprt_request_wait_receive - wait for the reply to an RPC request * @task: RPC task about to send a request * */ void xprt_request_wait_receive(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) return; /* * Sleep on the pending queue if we're expecting a reply. * The spinlock ensures atomicity between the test of * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). */ spin_lock(&xprt->queue_lock); if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { xprt->ops->wait_for_reply_request(task); /* * Send an extra queue wakeup call if the * connection was dropped in case the call to * rpc_sleep_on() raced. */ if (xprt_request_retransmit_after_disconnect(task)) rpc_wake_up_queued_task_set_status(&xprt->pending, task, -ENOTCONN); } spin_unlock(&xprt->queue_lock); } static bool xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req) { return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); } /** * xprt_request_enqueue_transmit - queue a task for transmission * @task: pointer to rpc_task * * Add a task to the transmission queue. */ void xprt_request_enqueue_transmit(struct rpc_task *task) { struct rpc_rqst *pos, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int ret; if (xprt_request_need_enqueue_transmit(task, req)) { ret = xprt_request_prepare(task->tk_rqstp, &req->rq_snd_buf); if (ret) { task->tk_status = ret; return; } req->rq_bytes_sent = 0; spin_lock(&xprt->queue_lock); /* * Requests that carry congestion control credits are added * to the head of the list to avoid starvation issues. */ if (req->rq_cong) { xprt_clear_congestion_window_wait(xprt); list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_cong) continue; /* Note: req is added _before_ pos */ list_add_tail(&req->rq_xmit, &pos->rq_xmit); INIT_LIST_HEAD(&req->rq_xmit2); goto out; } } else if (!req->rq_seqno) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; list_add_tail(&req->rq_xmit2, &pos->rq_xmit2); INIT_LIST_HEAD(&req->rq_xmit); goto out; } } list_add_tail(&req->rq_xmit, &xprt->xmit_queue); INIT_LIST_HEAD(&req->rq_xmit2); out: atomic_long_inc(&xprt->xmit_queuelen); set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); } } /** * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue * @task: pointer to rpc_task * * Remove a task from the transmission queue * Caller must hold xprt->queue_lock */ static void xprt_request_dequeue_transmit_locked(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) return; if (!list_empty(&req->rq_xmit)) { struct rpc_xprt *xprt = req->rq_xprt; if (list_is_first(&req->rq_xmit, &xprt->xmit_queue) && xprt->ops->abort_send_request) xprt->ops->abort_send_request(req); list_del(&req->rq_xmit); if (!list_empty(&req->rq_xmit2)) { struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, struct rpc_rqst, rq_xmit2); list_del(&req->rq_xmit2); list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue); } } else list_del(&req->rq_xmit2); atomic_long_dec(&req->rq_xprt->xmit_queuelen); xdr_free_bvec(&req->rq_snd_buf); } /** * xprt_request_dequeue_transmit - remove a task from the transmission queue * @task: pointer to rpc_task * * Remove a task from the transmission queue */ static void xprt_request_dequeue_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; spin_lock(&xprt->queue_lock); xprt_request_dequeue_transmit_locked(task); spin_unlock(&xprt->queue_lock); } /** * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue * @task: pointer to rpc_task * * Remove a task from the transmit and receive queues, and ensure that * it is not pinned by the receive work item. */ void xprt_request_dequeue_xprt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || xprt_is_pinned_rqst(req)) { spin_lock(&xprt->queue_lock); while (xprt_is_pinned_rqst(req)) { set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); xprt_wait_on_pinned_rqst(req); spin_lock(&xprt->queue_lock); clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); } xprt_request_dequeue_transmit_locked(task); xprt_request_dequeue_receive_locked(task); spin_unlock(&xprt->queue_lock); xdr_free_bvec(&req->rq_rcv_buf); } } /** * xprt_request_prepare - prepare an encoded request for transport * @req: pointer to rpc_rqst * @buf: pointer to send/rcv xdr_buf * * Calls into the transport layer to do whatever is needed to prepare * the request for transmission or receive. * Returns error, or zero. */ static int xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf) { struct rpc_xprt *xprt = req->rq_xprt; if (xprt->ops->prepare_request) return xprt->ops->prepare_request(req, buf); return 0; } /** * xprt_request_need_retransmit - Test if a task needs retransmission * @task: pointer to rpc_task * * Test for whether a connection breakage requires the task to retransmit */ bool xprt_request_need_retransmit(struct rpc_task *task) { return xprt_request_retransmit_after_disconnect(task); } /** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request * */ bool xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (!xprt_lock_write(xprt, task)) { /* Race breaker: someone may have transmitted us */ if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) rpc_wake_up_queued_task_set_status(&xprt->sending, task, 0); return false; } if (atomic_read(&xprt->swapper)) /* This will be clear in __rpc_execute */ current->flags |= PF_MEMALLOC; return true; } void xprt_end_transmit(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; xprt_inject_disconnect(xprt); xprt_release_write(xprt, task); } /** * xprt_request_transmit - send an RPC request on a transport * @req: pointer to request to transmit * @snd_task: RPC task that owns the transport lock * * This performs the transmission of a single request. * Note that if the request is not the same as snd_task, then it * does need to be pinned. * Returns '0' on success. */ static int xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) { struct rpc_xprt *xprt = req->rq_xprt; struct rpc_task *task = req->rq_task; unsigned int connect_cookie; int is_retrans = RPC_WAS_SENT(task); int status; if (test_bit(XPRT_CLOSE_WAIT, &xprt->state)) return -ENOTCONN; if (!req->rq_bytes_sent) { if (xprt_request_data_received(task)) { status = 0; goto out_dequeue; } /* Verify that our message lies in the RPCSEC_GSS window */ if (rpcauth_xmit_need_reencode(task)) { status = -EBADMSG; goto out_dequeue; } if (RPC_SIGNALLED(task)) { status = -ERESTARTSYS; goto out_dequeue; } } /* * Update req->rq_ntrans before transmitting to avoid races with * xprt_update_rtt(), which needs to know that it is recording a * reply to the first transmission. */ req->rq_ntrans++; trace_rpc_xdr_sendto(task, &req->rq_snd_buf); connect_cookie = xprt->connect_cookie; status = xprt->ops->send_request(req); if (status != 0) { req->rq_ntrans--; trace_xprt_transmit(req, status); return status; } if (is_retrans) { task->tk_client->cl_stats->rpcretrans++; trace_xprt_retransmit(req); } xprt_inject_disconnect(xprt); task->tk_flags |= RPC_TASK_SENT; spin_lock(&xprt->transport_lock); xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; xprt->stat.sending_u += xprt->sending.qlen; xprt->stat.pending_u += xprt->pending.qlen; spin_unlock(&xprt->transport_lock); req->rq_connect_cookie = connect_cookie; out_dequeue: trace_xprt_transmit(req, status); xprt_request_dequeue_transmit(task); rpc_wake_up_queued_task_set_status(&xprt->sending, task, status); return status; } /** * xprt_transmit - send an RPC request on a transport * @task: controlling RPC task * * Attempts to drain the transmit queue. On exit, either the transport * signalled an error that needs to be handled before transmission can * resume, or @task finished transmitting, and detected that it already * received a reply. */ void xprt_transmit(struct rpc_task *task) { struct rpc_rqst *next, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int status; spin_lock(&xprt->queue_lock); for (;;) { next = list_first_entry_or_null(&xprt->xmit_queue, struct rpc_rqst, rq_xmit); if (!next) break; xprt_pin_rqst(next); spin_unlock(&xprt->queue_lock); status = xprt_request_transmit(next, task); if (status == -EBADMSG && next != req) status = 0; spin_lock(&xprt->queue_lock); xprt_unpin_rqst(next); if (status < 0) { if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) task->tk_status = status; break; } /* Was @task transmitted, and has it received a reply? */ if (xprt_request_data_received(task) && !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) break; cond_resched_lock(&xprt->queue_lock); } spin_unlock(&xprt->queue_lock); } static void xprt_complete_request_init(struct rpc_task *task) { if (task->tk_rqstp) xprt_request_init(task); } void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) { set_bit(XPRT_CONGESTED, &xprt->state); rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init); } EXPORT_SYMBOL_GPL(xprt_add_backlog); static bool __xprt_set_rq(struct rpc_task *task, void *data) { struct rpc_rqst *req = data; if (task->tk_rqstp == NULL) { memset(req, 0, sizeof(*req)); /* mark unused */ task->tk_rqstp = req; return true; } return false; } bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) { clear_bit(XPRT_CONGESTED, &xprt->state); return false; } return true; } EXPORT_SYMBOL_GPL(xprt_wake_up_backlog); static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task) { bool ret = false; if (!test_bit(XPRT_CONGESTED, &xprt->state)) goto out; spin_lock(&xprt->reserve_lock); if (test_bit(XPRT_CONGESTED, &xprt->state)) { xprt_add_backlog(xprt, task); ret = true; } spin_unlock(&xprt->reserve_lock); out: return ret; } static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); if (xprt->num_reqs >= xprt->max_reqs) goto out; ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); req = kzalloc(sizeof(*req), rpc_task_gfp_mask()); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; --xprt->num_reqs; req = ERR_PTR(-ENOMEM); out: return req; } static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (xprt->num_reqs > xprt->min_reqs) { --xprt->num_reqs; kfree(req); return true; } return false; } void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req; spin_lock(&xprt->reserve_lock); if (!list_empty(&xprt->free)) { req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); list_del(&req->rq_list); goto out_init_req; } req = xprt_dynamic_alloc_slot(xprt); if (!IS_ERR(req)) goto out_init_req; switch (PTR_ERR(req)) { case -ENOMEM: dprintk("RPC: dynamic allocation of request slot " "failed! Retrying\n"); task->tk_status = -ENOMEM; break; case -EAGAIN: xprt_add_backlog(xprt, task); dprintk("RPC: waiting for request slot\n"); fallthrough; default: task->tk_status = -EAGAIN; } spin_unlock(&xprt->reserve_lock); return; out_init_req: xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots, xprt->num_reqs); spin_unlock(&xprt->reserve_lock); task->tk_status = 0; task->tk_rqstp = req; } EXPORT_SYMBOL_GPL(xprt_alloc_slot); void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); if (!xprt_wake_up_backlog(xprt, req) && !xprt_dynamic_free_slot(xprt, req)) { memset(req, 0, sizeof(*req)); /* mark unused */ list_add(&req->rq_list, &xprt->free); } spin_unlock(&xprt->reserve_lock); } EXPORT_SYMBOL_GPL(xprt_free_slot); static void xprt_free_all_slots(struct rpc_xprt *xprt) { struct rpc_rqst *req; while (!list_empty(&xprt->free)) { req = list_first_entry(&xprt->free, struct rpc_rqst, rq_list); list_del(&req->rq_list); kfree(req); } } static DEFINE_IDA(rpc_xprt_ids); void xprt_cleanup_ids(void) { ida_destroy(&rpc_xprt_ids); } static int xprt_alloc_id(struct rpc_xprt *xprt) { int id; id = ida_alloc(&rpc_xprt_ids, GFP_KERNEL); if (id < 0) return id; xprt->id = id; return 0; } static void xprt_free_id(struct rpc_xprt *xprt) { ida_free(&rpc_xprt_ids, xprt->id); } struct rpc_xprt *xprt_alloc(struct net *net, size_t size, unsigned int num_prealloc, unsigned int max_alloc) { struct rpc_xprt *xprt; struct rpc_rqst *req; int i; xprt = kzalloc(size, GFP_KERNEL); if (xprt == NULL) goto out; xprt_alloc_id(xprt); xprt_init(xprt, net); for (i = 0; i < num_prealloc; i++) { req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); if (!req) goto out_free; list_add(&req->rq_list, &xprt->free); } xprt->max_reqs = max_t(unsigned int, max_alloc, num_prealloc); xprt->min_reqs = num_prealloc; xprt->num_reqs = num_prealloc; return xprt; out_free: xprt_free(xprt); out: return NULL; } EXPORT_SYMBOL_GPL(xprt_alloc); void xprt_free(struct rpc_xprt *xprt) { put_net_track(xprt->xprt_net, &xprt->ns_tracker); xprt_free_all_slots(xprt); xprt_free_id(xprt); rpc_sysfs_xprt_destroy(xprt); kfree_rcu(xprt, rcu); } EXPORT_SYMBOL_GPL(xprt_free); static void xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt) { req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1; } static __be32 xprt_alloc_xid(struct rpc_xprt *xprt) { __be32 xid; spin_lock(&xprt->reserve_lock); xid = (__force __be32)xprt->xid++; spin_unlock(&xprt->reserve_lock); return xid; } static void xprt_init_xid(struct rpc_xprt *xprt) { xprt->xid = get_random_u32(); } static void xprt_request_init(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; req->rq_xid = xprt_alloc_xid(xprt); xprt_init_connect_cookie(req, xprt); req->rq_snd_buf.len = 0; req->rq_snd_buf.buflen = 0; req->rq_rcv_buf.len = 0; req->rq_rcv_buf.buflen = 0; req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; xprt_init_majortimeo(task, req, task->tk_client->cl_timeout); trace_xprt_reserve(req); } static void xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task) { xprt->ops->alloc_slot(xprt, task); if (task->tk_rqstp != NULL) xprt_request_init(task); } /** * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation * * If the transport is marked as being congested, or if no more * slots are available, place the task on the transport's * backlog queue. */ void xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = 0; if (task->tk_rqstp != NULL) return; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) xprt_do_reserve(xprt, task); } /** * xprt_retry_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation * * If no more slots are available, place the task on the transport's * backlog queue. * Note that the only difference with xprt_reserve is that we now * ignore the value of the XPRT_CONGESTED flag. */ void xprt_retry_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = 0; if (task->tk_rqstp != NULL) return; task->tk_status = -EAGAIN; xprt_do_reserve(xprt, task); } /** * xprt_release - release an RPC request slot * @task: task which is finished with the slot * */ void xprt_release(struct rpc_task *task) { struct rpc_xprt *xprt; struct rpc_rqst *req = task->tk_rqstp; if (req == NULL) { if (task->tk_client) { xprt = task->tk_xprt; xprt_release_write(xprt, task); } return; } xprt = req->rq_xprt; xprt_request_dequeue_xprt(task); spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) xprt->ops->release_request(task); xprt_schedule_autodisconnect(xprt); spin_unlock(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); task->tk_rqstp = NULL; if (likely(!bc_prealloc(req))) xprt->ops->free_slot(xprt, req); else xprt_free_bc_request(req); } #ifdef CONFIG_SUNRPC_BACKCHANNEL void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task, const struct rpc_timeout *to) { struct xdr_buf *xbufp = &req->rq_snd_buf; task->tk_rqstp = req; req->rq_task = task; xprt_init_connect_cookie(req, req->rq_xprt); /* * Set up the xdr_buf length. * This also indicates that the buffer is XDR encoded already. */ xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + xbufp->tail[0].iov_len; /* * Backchannel Replies are sent with !RPC_TASK_SOFT and * RPC_TASK_NO_RETRANS_TIMEOUT. The major timeout setting * affects only how long each Reply waits to be sent when * a transport connection cannot be established. */ xprt_init_majortimeo(task, req, to); } #endif static void xprt_init(struct rpc_xprt *xprt, struct net *net) { kref_init(&xprt->kref); spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); spin_lock_init(&xprt->queue_lock); INIT_LIST_HEAD(&xprt->free); xprt->recv_queue = RB_ROOT; INIT_LIST_HEAD(&xprt->xmit_queue); #if defined(CONFIG_SUNRPC_BACKCHANNEL) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ INIT_LIST_HEAD(&xprt->xprt_switch); xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; xprt->bind_index = 0; rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); xprt_init_xid(xprt); xprt->xprt_net = get_net_track(net, &xprt->ns_tracker, GFP_KERNEL); } /** * xprt_create_transport - create an RPC transport * @args: rpc transport creation arguments * */ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; const struct xprt_class *t; t = xprt_class_find_by_ident(args->ident); if (!t) { dprintk("RPC: transport (%d) not supported\n", args->ident); return ERR_PTR(-EIO); } xprt = t->setup(args); xprt_class_release(t); if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) timer_setup(&xprt->timer, xprt_init_autodisconnect, 0); else timer_setup(&xprt->timer, NULL, 0); if (strlen(args->servername) > RPC_MAXNETNAMELEN) { xprt_destroy(xprt); return ERR_PTR(-EINVAL); } xprt->servername = kstrdup(args->servername, GFP_KERNEL); if (xprt->servername == NULL) { xprt_destroy(xprt); return ERR_PTR(-ENOMEM); } rpc_xprt_debugfs_register(xprt); trace_xprt_create(xprt); out: return xprt; } static void xprt_destroy_cb(struct work_struct *work) { struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); trace_xprt_destroy(xprt); rpc_xprt_debugfs_unregister(xprt); rpc_destroy_wait_queue(&xprt->binding); rpc_destroy_wait_queue(&xprt->pending); rpc_destroy_wait_queue(&xprt->sending); rpc_destroy_wait_queue(&xprt->backlog); kfree(xprt->servername); /* * Destroy any existing back channel */ xprt_destroy_backchannel(xprt, UINT_MAX); /* * Tear down transport state and free the rpc_xprt */ xprt->ops->destroy(xprt); } /** * xprt_destroy - destroy an RPC transport, killing off all requests. * @xprt: transport to destroy * */ static void xprt_destroy(struct rpc_xprt *xprt) { /* * Exclude transport connect/disconnect handlers and autoclose */ wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE); /* * xprt_schedule_autodisconnect() can run after XPRT_LOCKED * is cleared. We use ->transport_lock to ensure the mod_timer() * can only run *before* del_time_sync(), never after. */ spin_lock(&xprt->transport_lock); del_timer_sync(&xprt->timer); spin_unlock(&xprt->transport_lock); /* * Destroy sockets etc from the system workqueue so they can * safely flush receive work running on rpciod. */ INIT_WORK(&xprt->task_cleanup, xprt_destroy_cb); schedule_work(&xprt->task_cleanup); } static void xprt_destroy_kref(struct kref *kref) { xprt_destroy(container_of(kref, struct rpc_xprt, kref)); } /** * xprt_get - return a reference to an RPC transport. * @xprt: pointer to the transport * */ struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) { if (xprt != NULL && kref_get_unless_zero(&xprt->kref)) return xprt; return NULL; } EXPORT_SYMBOL_GPL(xprt_get); /** * xprt_put - release a reference to an RPC transport. * @xprt: pointer to the transport * */ void xprt_put(struct rpc_xprt *xprt) { if (xprt != NULL) kref_put(&xprt->kref, xprt_destroy_kref); } EXPORT_SYMBOL_GPL(xprt_put); void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) { if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) { spin_lock(&xps->xps_lock); xps->xps_nactive--; spin_unlock(&xps->xps_lock); } } void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) { if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) { spin_lock(&xps->xps_lock); xps->xps_nactive++; spin_unlock(&xps->xps_lock); } } void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) { if (test_and_set_bit(XPRT_REMOVE, &xprt->state)) return; xprt_force_disconnect(xprt); if (!test_bit(XPRT_CONNECTED, &xprt->state)) return; if (!xprt->sending.qlen && !xprt->pending.qlen && !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen)) rpc_xprt_switch_remove_xprt(xps, xprt, true); } |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 | // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> enum udp_tunnel_nic_table_entry_flags { UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), }; struct udp_tunnel_nic_table_entry { __be16 port; u8 type; u8 flags; u16 use_cnt; #define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; /** * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled * @n_tables: number of tables under @entries * @missed: bitmap of tables which overflown * @entries: table of tables of ports currently offloaded */ struct udp_tunnel_nic { struct work_struct work; struct net_device *dev; u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; unsigned int n_tables; unsigned long missed; struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables); }; /* We ensure all work structs are done using driver state, but not the code. * We need a workqueue we can flush before module gets removed. */ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. */ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; bool may_sleep; if (!utn->need_sync) return; /* Drivers which sleep in the callback need to update from * the workqueue, if we come from the tunnel driver's notification. */ may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (!may_sleep) __udp_tunnel_nic_device_sync(dev, utn); if (may_sleep || utn->need_replay) { queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. */ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. */ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; ASSERT_RTNL(); utn = dev->udp_tunnel_nic; if (!utn) return; utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release rtnl across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while (i--) kfree(utn->entries[i]); kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) udp_tunnel_get_rx_info(dev); return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_flush(dev, utn); return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL"); |
| 3 3 3 3 3 3 3 2 2 2 2 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux IPv6 multicast routing support for BSD pim6sd * Based on net/ipv4/ipmr.c. * * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr> * LSIIT Laboratory, Strasbourg, France * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com> * 6WIND, Paris, France * Copyright (C)2007,2008 USAGI/WIDE Project * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/rhashtable.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <linux/mroute6.h> #include <linux/pim.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6.h> #include <linux/export.h> #include <net/ip6_checksum.h> #include <linux/netconf.h> #include <net/ip_tunnels.h> #include <linux/nospec.h> struct ip6mr_rule { struct fib_rule common; }; struct ip6mr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. */ static DEFINE_SPINLOCK(mrt_lock); static struct net_device *vif_dev_read(const struct vif_device *vif) { return rcu_dereference(vif->dev); } /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved entries is changed only in process context and protected with weak lock mrt_lock. Queue of unresolved entries is protected with strong spinlock mfc_unres_lock. In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv6.mr6_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv6.mr6_tables) return NULL; return ret; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { int err; struct ip6mr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(flp6)); err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) { return 1; } static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), .action = ip6mr_rule_action, .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, }; static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT); if (err < 0) goto err2; net->ipv6.mr6_rules_ops = ops; return 0; err2: rtnl_lock(); ip6mr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } bool ip6mr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && rule->table == RT6_TABLE_DFLT && !rule->l3mdev; } EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv6.mrt6; return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; } static int __net_init ip6mr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv6.mrt6 = mrt; return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) { ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ip6mr_rules_seq_read(struct net *net) { return 0; } #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc6_cache_cmp_arg *cmparg = arg->key; struct mfc6_cache *c = (struct mfc6_cache *)ptr; return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); } static const struct rhashtable_params ip6mr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; static void ip6mr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif } static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { .mf6c_origin = IN6ADDR_ANY_INIT, .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; static struct mr_table_ops ip6mr_mr_table_ops = { .rht_params = &ip6mr_rht_params, .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, }; static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; mrt = ip6mr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) { timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing * /proc/ip6_mr_cache /proc/ip6_mr_vif */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); iter->mrt = mrt; rcu_read_lock(); return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); } return 0; } static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group " "Origin " "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->_c.mfc_un.res.pkt, mfc->_c.mfc_un.res.bytes, mfc->_c.mfc_un.res.wrong_if); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IPV6_PIMSM_V2 static int pim6_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int reg_vif_num; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, sizeof(*pim), IPPROTO_PIM, csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct ipv6hdr *)(skb_transport_header(skb) + sizeof(*pim)); if (!ipv6_addr_is_multicast(&encap->daddr) || encap->payload_len == 0 || ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (reg_vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]); if (!reg_dev) goto drop; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return 0; drop: kfree_skb(skb); return 0; } static const struct inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; dev->netns_local = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv6.ipmr_seq); } static int call_ip6mr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc6_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, &mfc->_c, tb_id, &net->ipv6.ipmr_seq); } /* Delete a VIF entry */ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); spin_lock(&mrt_lock); RCU_INIT_POINTER(v->dev, NULL); #ifdef CONFIG_IPV6_PIMSM_V2 if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } #endif if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } cache->mfc_un.res.lastuse = jiffies; } static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { #ifdef CONFIG_IPV6_PIMSM_V2 case MIFF_REGISTER: /* * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; #endif case 0: dev = dev_get_by_index(net, vifc->mif6c_pifi); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); rcu_read_unlock(); } } } /* * Bounce a cache query up to pim6sd and netlink. * * Called under rcu_read_lock() */ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(struct net *net) { ASSERT_RTNL(); return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv6.ipmr_seq = 0; ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ip6mr_rules_exit(net); rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_notifier_exit(net); } static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip6mr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, .exit_batch = ip6mr_net_exit_batch, }; int __init ip6_mr_init(void) { int err; mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, ip6mr_rtm_getroute, ip6mr_rtm_dumproute, 0); if (err == 0) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void ip6_mr_cleanup(void) { rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; struct mr_table *mrt; int err = -EACCES; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; devconf = net->ipv6.devconf_all; if (!devconf || !atomic_read(&devconf->mc_forwarding)) return err; rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { bool do_wrmifwhole; int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. */ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { struct sioc_sg_req6 *sr; struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: vr = (struct sioc_mif_req6 *)arg; if (vr->mifi >= mrt->maxvif) return -EINVAL; vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->mifi]; if (VIF_EXISTS(mrt, vr->mifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { sr->pktcnt = c->_c.mfc_un.res.pkt; sr->bytecnt = c->_c.mfc_un.res.bytes; sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->_c.mfc_un.res.pkt; sr.bytecnt = c->_c.mfc_un.res.bytes; sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; vif_dev = vif_dev_read(vif); if (!vif_dev) goto out_free; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } #endif ipv6h = ipv6_hdr(skb); fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); goto out_free; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ skb->dev = vif_dev; WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) goto out_free; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; IP6CB(skb)->flags |= IP6SKB_FORWARDED; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, vif_dev, ip6mr_forward2_finish); out_free: kfree_skb(skb); return 0; } /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* Called under rcu_read_lock() */ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; c->_c.mfc_un.res.bytes += skb->len; c->_c.mfc_un.res.lastuse = jiffies; if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); if (mrt->mroute_do_wrvifwhole) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int err; struct net_device *dev; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ dev = skb->dev; if (netif_is_l3_master(skb->dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); return err; } kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); return 0; } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return -ENOENT; rcu_read_lock(); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { rcu_read_unlock(); return -ENODEV; } /* really correct? */ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct mrt6msg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct mrt6msg); msg = (struct mrt6msg *)skb_transport_header(pkt); skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) || nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst)) goto nla_put_failure; nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS); } static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = { [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_TABLE] = { .type = NLA_U32 }, }; static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy, extack); if (err) return err; rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for multicast route get request"); return -EINVAL; } if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } return 0; } static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct in6_addr src = {}, grp = {}; struct nlattr *tb[RTA_MAX + 1]; struct mfc6_cache *cache; struct mr_table *mrt; struct sk_buff *skb; u32 tableid; int err; err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; if (tb[RTA_SRC]) src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ip6mr_cache_find(mrt, &src, &grp); rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); return -ENOENT; } skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) return -ENOBUFS; err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) { kfree_skb(skb); return err; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = { .rtnl_held = true, }; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); } |
| 3 355 8 17 67 2575 5834 5856 157 158 8 8 598 601 601 189 189 2819 2817 7 2572 2575 2578 93 93 75 76 2921 2896 25 2771 2777 327 328 1529 1533 239 240 249 250 3115 3128 19 19 1530 1535 1533 45 45 16 16 4948 4944 4964 2076 2081 2073 59 59 1292 10 1287 1295 1295 10 1286 1287 29 30 1308 1309 66 66 528 531 142 142 219 220 216 217 132 132 637 636 639 198 198 66 1 65 156 1 156 3 3 1314 1316 212 213 199 199 355 356 219 219 675 676 145 146 127 128 790 789 712 91 790 35 35 221 222 7362 7392 964 2 967 810 2 812 345 346 242 242 242 242 147 147 131 131 228 228 380 381 95 95 133 133 133 133 132 132 1731 1739 231 231 111 111 63 63 1078 1083 4857 4904 4614 4615 4631 1944 1951 3408 3421 3413 3176 3192 50 261 310 311 312 312 77 77 207 210 23 23 87 88 3937 3940 3938 3554 3558 328 329 8 8 8 11 11 355 355 355 68 68 3823 3831 79 79 145 145 5214 5208 5206 4 4 2 2 215 215 215 215 3 3 3 3 1332 1336 4 4 4 4 3 3 3 3 3 3 3 3 3 17 17 17 28 28 28 28 67 67 67 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Security plug functions * * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com> * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2016 Mellanox Technologies * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com> */ #define pr_fmt(fmt) "LSM: " fmt #include <linux/bpf.h> #include <linux/capability.h> #include <linux/dcache.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/lsm_hooks.h> #include <linux/fsnotify.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/personality.h> #include <linux/backing-dev.h> #include <linux/string.h> #include <linux/xattr.h> #include <linux/msg.h> #include <linux/overflow.h> #include <linux/perf_event.h> #include <linux/fs.h> #include <net/flow.h> #include <net/sock.h> #define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX /* * Identifier for the LSM static calls. * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h * IDX is the index of the static call. 0 <= NUM < MAX_LSM_COUNT */ #define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX /* * Call the macro M for each LSM hook MAX_LSM_COUNT times. */ #define LSM_LOOP_UNROLL(M, ...) \ do { \ UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) \ } while (0) #define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) /* * These are descriptions of the reasons that can be passed to the * security_locked_down() LSM hook. Placing this array here allows * all security modules to use the same descriptions for auditing * purposes. */ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_EFI_TEST] = "/dev/efi_test access", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); static struct kmem_cache *lsm_file_cache; static struct kmem_cache *lsm_inode_cache; char *lsm_names; static struct lsm_blob_sizes blob_sizes __ro_after_init; /* Boot-time LSM user choice */ static __initdata const char *chosen_lsm_order; static __initdata const char *chosen_major_lsm; static __initconst const char *const builtin_lsm_order = CONFIG_LSM; /* Ordered list of LSMs to initialize. */ static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1]; static __initdata struct lsm_info *exclusive; #ifdef CONFIG_HAVE_STATIC_CALL #define LSM_HOOK_TRAMP(NAME, NUM) \ &STATIC_CALL_TRAMP(LSM_STATIC_CALL(NAME, NUM)) #else #define LSM_HOOK_TRAMP(NAME, NUM) NULL #endif /* * Define static calls and static keys for each LSM hook. */ #define DEFINE_LSM_STATIC_CALL(NUM, NAME, RET, ...) \ DEFINE_STATIC_CALL_NULL(LSM_STATIC_CALL(NAME, NUM), \ *((RET(*)(__VA_ARGS__))NULL)); \ DEFINE_STATIC_KEY_FALSE(SECURITY_HOOK_ACTIVE_KEY(NAME, NUM)); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ LSM_DEFINE_UNROLL(DEFINE_LSM_STATIC_CALL, NAME, RET, __VA_ARGS__) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK #undef DEFINE_LSM_STATIC_CALL /* * Initialise a table of static calls for each LSM hook. * DEFINE_STATIC_CALL_NULL invocation above generates a key (STATIC_CALL_KEY) * and a trampoline (STATIC_CALL_TRAMP) which are used to call * __static_call_update when updating the static call. * * The static calls table is used by early LSMs, some architectures can fault on * unaligned accesses and the fault handling code may not be ready by then. * Thus, the static calls table should be aligned to avoid any unhandled faults * in early init. */ struct lsm_static_calls_table static_calls_table __ro_after_init __aligned(sizeof(u64)) = { #define INIT_LSM_STATIC_CALL(NUM, NAME) \ (struct lsm_static_call) { \ .key = &STATIC_CALL_KEY(LSM_STATIC_CALL(NAME, NUM)), \ .trampoline = LSM_HOOK_TRAMP(NAME, NUM), \ .active = &SECURITY_HOOK_ACTIVE_KEY(NAME, NUM), \ }, #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ .NAME = { \ LSM_DEFINE_UNROLL(INIT_LSM_STATIC_CALL, NAME) \ }, #include <linux/lsm_hook_defs.h> #undef LSM_HOOK #undef INIT_LSM_STATIC_CALL }; static __initdata bool debug; #define init_debug(...) \ do { \ if (debug) \ pr_info(__VA_ARGS__); \ } while (0) static bool __init is_enabled(struct lsm_info *lsm) { if (!lsm->enabled) return false; return *lsm->enabled; } /* Mark an LSM's enabled flag. */ static int lsm_enabled_true __initdata = 1; static int lsm_enabled_false __initdata = 0; static void __init set_enabled(struct lsm_info *lsm, bool enabled) { /* * When an LSM hasn't configured an enable variable, we can use * a hard-coded location for storing the default enabled state. */ if (!lsm->enabled) { if (enabled) lsm->enabled = &lsm_enabled_true; else lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_true) { if (!enabled) lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_false) { if (enabled) lsm->enabled = &lsm_enabled_true; } else { *lsm->enabled = enabled; } } /* Is an LSM already listed in the ordered LSMs list? */ static bool __init exists_ordered_lsm(struct lsm_info *lsm) { struct lsm_info **check; for (check = ordered_lsms; *check; check++) if (*check == lsm) return true; return false; } /* Append an LSM to the list of ordered LSMs to initialize. */ static int last_lsm __initdata; static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) { /* Ignore duplicate selections. */ if (exists_ordered_lsm(lsm)) return; if (WARN(last_lsm == MAX_LSM_COUNT, "%s: out of LSM static calls!?\n", from)) return; /* Enable this LSM, if it is not already set. */ if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; ordered_lsms[last_lsm++] = lsm; init_debug("%s ordered: %s (%s)\n", from, lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); } /* Is an LSM allowed to be initialized? */ static bool __init lsm_allowed(struct lsm_info *lsm) { /* Skip if the LSM is disabled. */ if (!is_enabled(lsm)) return false; /* Not allowed if another exclusive LSM already initialized. */ if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { init_debug("exclusive disabled: %s\n", lsm->name); return false; } return true; } static void __init lsm_set_blob_size(int *need, int *lbs) { int offset; if (*need <= 0) return; offset = ALIGN(*lbs, sizeof(void *)); *lbs = offset + *need; *need = offset; } static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) { if (!needed) return; lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); lsm_set_blob_size(&needed->lbs_ib, &blob_sizes.lbs_ib); /* * The inode blob gets an rcu_head in addition to * what the modules might need. */ if (needed->lbs_inode && blob_sizes.lbs_inode == 0) blob_sizes.lbs_inode = sizeof(struct rcu_head); lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode); lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc); lsm_set_blob_size(&needed->lbs_key, &blob_sizes.lbs_key); lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg); lsm_set_blob_size(&needed->lbs_perf_event, &blob_sizes.lbs_perf_event); lsm_set_blob_size(&needed->lbs_sock, &blob_sizes.lbs_sock); lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock); lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task); lsm_set_blob_size(&needed->lbs_tun_dev, &blob_sizes.lbs_tun_dev); lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); } /* Prepare LSM for initialization. */ static void __init prepare_lsm(struct lsm_info *lsm) { int enabled = lsm_allowed(lsm); /* Record enablement (to handle any following exclusive LSMs). */ set_enabled(lsm, enabled); /* If enabled, do pre-initialization work. */ if (enabled) { if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { exclusive = lsm; init_debug("exclusive chosen: %s\n", lsm->name); } lsm_set_blob_sizes(lsm->blobs); } } /* Initialize a given LSM, if it is enabled. */ static void __init initialize_lsm(struct lsm_info *lsm) { if (is_enabled(lsm)) { int ret; init_debug("initializing %s\n", lsm->name); ret = lsm->init(); WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); } } /* * Current index to use while initializing the lsm id list. */ u32 lsm_active_cnt __ro_after_init; const struct lsm_id *lsm_idlist[MAX_LSM_COUNT]; /* Populate ordered LSMs list from comma-separated LSM name list. */ static void __init ordered_lsm_parse(const char *order, const char *origin) { struct lsm_info *lsm; char *sep, *name, *next; /* LSM_ORDER_FIRST is always first. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_FIRST) append_ordered_lsm(lsm, " first"); } /* Process "security=", if given. */ if (chosen_major_lsm) { struct lsm_info *major; /* * To match the original "security=" behavior, this * explicitly does NOT fallback to another Legacy Major * if the selected one was separately disabled: disable * all non-matching Legacy Major LSMs. */ for (major = __start_lsm_info; major < __end_lsm_info; major++) { if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && strcmp(major->name, chosen_major_lsm) != 0) { set_enabled(major, false); init_debug("security=%s disabled: %s (only one legacy major LSM)\n", chosen_major_lsm, major->name); } } } sep = kstrdup(order, GFP_KERNEL); next = sep; /* Walk the list, looking for matching LSMs. */ while ((name = strsep(&next, ",")) != NULL) { bool found = false; for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (strcmp(lsm->name, name) == 0) { if (lsm->order == LSM_ORDER_MUTABLE) append_ordered_lsm(lsm, origin); found = true; } } if (!found) init_debug("%s ignored: %s (not built into kernel)\n", origin, name); } /* Process "security=", if given. */ if (chosen_major_lsm) { for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; if (strcmp(lsm->name, chosen_major_lsm) == 0) append_ordered_lsm(lsm, "security="); } } /* LSM_ORDER_LAST is always last. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_LAST) append_ordered_lsm(lsm, " last"); } /* Disable all LSMs not in the ordered list. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; set_enabled(lsm, false); init_debug("%s skipped: %s (not in requested order)\n", origin, lsm->name); } kfree(sep); } static void __init lsm_static_call_init(struct security_hook_list *hl) { struct lsm_static_call *scall = hl->scalls; int i; for (i = 0; i < MAX_LSM_COUNT; i++) { /* Update the first static call that is not used yet */ if (!scall->hl) { __static_call_update(scall->key, scall->trampoline, hl->hook.lsm_func_addr); scall->hl = hl; static_branch_enable(scall->active); return; } scall++; } panic("%s - Ran out of static slots.\n", __func__); } static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); static int lsm_append(const char *new, char **result); static void __init report_lsm_order(void) { struct lsm_info **lsm, *early; int first = 0; pr_info("initializing lsm="); /* Report each enabled LSM name, comma separated. */ for (early = __start_early_lsm_info; early < __end_early_lsm_info; early++) if (is_enabled(early)) pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); for (lsm = ordered_lsms; *lsm; lsm++) if (is_enabled(*lsm)) pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name); pr_cont("\n"); } static void __init ordered_lsm_init(void) { struct lsm_info **lsm; if (chosen_lsm_order) { if (chosen_major_lsm) { pr_warn("security=%s is ignored because it is superseded by lsm=%s\n", chosen_major_lsm, chosen_lsm_order); chosen_major_lsm = NULL; } ordered_lsm_parse(chosen_lsm_order, "cmdline"); } else ordered_lsm_parse(builtin_lsm_order, "builtin"); for (lsm = ordered_lsms; *lsm; lsm++) prepare_lsm(*lsm); report_lsm_order(); init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); init_debug("file blob size = %d\n", blob_sizes.lbs_file); init_debug("ib blob size = %d\n", blob_sizes.lbs_ib); init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); #ifdef CONFIG_KEYS init_debug("key blob size = %d\n", blob_sizes.lbs_key); #endif /* CONFIG_KEYS */ init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); init_debug("sock blob size = %d\n", blob_sizes.lbs_sock); init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event); init_debug("task blob size = %d\n", blob_sizes.lbs_task); init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev); init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev); /* * Create any kmem_caches needed for blobs */ if (blob_sizes.lbs_file) lsm_file_cache = kmem_cache_create("lsm_file_cache", blob_sizes.lbs_file, 0, SLAB_PANIC, NULL); if (blob_sizes.lbs_inode) lsm_inode_cache = kmem_cache_create("lsm_inode_cache", blob_sizes.lbs_inode, 0, SLAB_PANIC, NULL); lsm_early_cred((struct cred *) current->cred); lsm_early_task(current); for (lsm = ordered_lsms; *lsm; lsm++) initialize_lsm(*lsm); } int __init early_security_init(void) { struct lsm_info *lsm; for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; prepare_lsm(lsm); initialize_lsm(lsm); } return 0; } /** * security_init - initializes the security framework * * This should be called early in the kernel initialization sequence. */ int __init security_init(void) { struct lsm_info *lsm; init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*"); init_debug(" CONFIG_LSM=%s\n", builtin_lsm_order); init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*"); /* * Append the names of the early LSM modules now that kmalloc() is * available */ for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { init_debug(" early started: %s (%s)\n", lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); if (lsm->enabled) lsm_append(lsm->name, &lsm_names); } /* Load LSMs in specified order. */ ordered_lsm_init(); return 0; } /* Save user chosen LSM */ static int __init choose_major_lsm(char *str) { chosen_major_lsm = str; return 1; } __setup("security=", choose_major_lsm); /* Explicitly choose LSM initialization order. */ static int __init choose_lsm_order(char *str) { chosen_lsm_order = str; return 1; } __setup("lsm=", choose_lsm_order); /* Enable LSM order debugging. */ static int __init enable_debug(char *str) { debug = true; return 1; } __setup("lsm.debug", enable_debug); static bool match_last_lsm(const char *list, const char *lsm) { const char *last; if (WARN_ON(!list || !lsm)) return false; last = strrchr(list, ','); if (last) /* Pass the comma, strcmp() will check for '\0' */ last++; else last = list; return !strcmp(last, lsm); } static int lsm_append(const char *new, char **result) { char *cp; if (*result == NULL) { *result = kstrdup(new, GFP_KERNEL); if (*result == NULL) return -ENOMEM; } else { /* Check if it is the last registered name */ if (match_last_lsm(*result, new)) return 0; cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); if (cp == NULL) return -ENOMEM; kfree(*result); *result = cp; } return 0; } /** * security_add_hooks - Add a modules hooks to the hook lists. * @hooks: the hooks to add * @count: the number of hooks to add * @lsmid: the identification information for the security module * * Each LSM has to register its hooks with the infrastructure. */ void __init security_add_hooks(struct security_hook_list *hooks, int count, const struct lsm_id *lsmid) { int i; /* * A security module may call security_add_hooks() more * than once during initialization, and LSM initialization * is serialized. Landlock is one such case. * Look at the previous entry, if there is one, for duplication. */ if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) { if (lsm_active_cnt >= MAX_LSM_COUNT) panic("%s Too many LSMs registered.\n", __func__); lsm_idlist[lsm_active_cnt++] = lsmid; } for (i = 0; i < count; i++) { hooks[i].lsmid = lsmid; lsm_static_call_init(&hooks[i]); } /* * Don't try to append during early_security_init(), we'll come back * and fix this up afterwards. */ if (slab_is_available()) { if (lsm_append(lsmid->name, &lsm_names) < 0) panic("%s - Cannot get early memory.\n", __func__); } } int call_blocking_lsm_notifier(enum lsm_event event, void *data) { return blocking_notifier_call_chain(&blocking_lsm_notifier_chain, event, data); } EXPORT_SYMBOL(call_blocking_lsm_notifier); int register_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(register_blocking_lsm_notifier); int unregister_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(unregister_blocking_lsm_notifier); /** * lsm_blob_alloc - allocate a composite blob * @dest: the destination for the blob * @size: the size of the blob * @gfp: allocation type * * Allocate a blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp) { if (size == 0) { *dest = NULL; return 0; } *dest = kzalloc(size, gfp); if (*dest == NULL) return -ENOMEM; return 0; } /** * lsm_cred_alloc - allocate a composite cred blob * @cred: the cred that needs a blob * @gfp: allocation type * * Allocate the cred blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp); } /** * lsm_early_cred - during initialization allocate a composite cred blob * @cred: the cred that needs a blob * * Allocate the cred blob for all the modules */ static void __init lsm_early_cred(struct cred *cred) { int rc = lsm_cred_alloc(cred, GFP_KERNEL); if (rc) panic("%s: Early cred alloc failed.\n", __func__); } /** * lsm_file_alloc - allocate a composite file blob * @file: the file that needs a blob * * Allocate the file blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_file_alloc(struct file *file) { if (!lsm_file_cache) { file->f_security = NULL; return 0; } file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); if (file->f_security == NULL) return -ENOMEM; return 0; } /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob * @gfp: allocation flags * * Allocate the inode blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_inode_alloc(struct inode *inode, gfp_t gfp) { if (!lsm_inode_cache) { inode->i_security = NULL; return 0; } inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp); if (inode->i_security == NULL) return -ENOMEM; return 0; } /** * lsm_task_alloc - allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_task_alloc(struct task_struct *task) { return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL); } /** * lsm_ipc_alloc - allocate a composite ipc blob * @kip: the ipc that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) { return lsm_blob_alloc(&kip->security, blob_sizes.lbs_ipc, GFP_KERNEL); } #ifdef CONFIG_KEYS /** * lsm_key_alloc - allocate a composite key blob * @key: the key that needs a blob * * Allocate the key blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_key_alloc(struct key *key) { return lsm_blob_alloc(&key->security, blob_sizes.lbs_key, GFP_KERNEL); } #endif /* CONFIG_KEYS */ /** * lsm_msg_msg_alloc - allocate a composite msg_msg blob * @mp: the msg_msg that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_msg_msg_alloc(struct msg_msg *mp) { return lsm_blob_alloc(&mp->security, blob_sizes.lbs_msg_msg, GFP_KERNEL); } /** * lsm_bdev_alloc - allocate a composite block_device blob * @bdev: the block_device that needs a blob * * Allocate the block_device blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_bdev_alloc(struct block_device *bdev) { if (blob_sizes.lbs_bdev == 0) { bdev->bd_security = NULL; return 0; } bdev->bd_security = kzalloc(blob_sizes.lbs_bdev, GFP_KERNEL); if (!bdev->bd_security) return -ENOMEM; return 0; } /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules */ static void __init lsm_early_task(struct task_struct *task) { int rc = lsm_task_alloc(task); if (rc) panic("%s: Early task alloc failed.\n", __func__); } /** * lsm_superblock_alloc - allocate a composite superblock blob * @sb: the superblock that needs a blob * * Allocate the superblock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_superblock_alloc(struct super_block *sb) { return lsm_blob_alloc(&sb->s_security, blob_sizes.lbs_superblock, GFP_KERNEL); } /** * lsm_fill_user_ctx - Fill a user space lsm_ctx structure * @uctx: a userspace LSM context to be filled * @uctx_len: available uctx size (input), used uctx size (output) * @val: the new LSM context value * @val_len: the size of the new LSM context value * @id: LSM id * @flags: LSM defined flags * * Fill all of the fields in a userspace lsm_ctx structure. If @uctx is NULL * simply calculate the required size to output via @utc_len and return * success. * * Returns 0 on success, -E2BIG if userspace buffer is not large enough, * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated. */ int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags) { struct lsm_ctx *nctx = NULL; size_t nctx_len; int rc = 0; nctx_len = ALIGN(struct_size(nctx, ctx, val_len), sizeof(void *)); if (nctx_len > *uctx_len) { rc = -E2BIG; goto out; } /* no buffer - return success/0 and set @uctx_len to the req size */ if (!uctx) goto out; nctx = kzalloc(nctx_len, GFP_KERNEL); if (nctx == NULL) { rc = -ENOMEM; goto out; } nctx->id = id; nctx->flags = flags; nctx->len = nctx_len; nctx->ctx_len = val_len; memcpy(nctx->ctx, val, val_len); if (copy_to_user(uctx, nctx, nctx_len)) rc = -EFAULT; out: kfree(nctx); *uctx_len = nctx_len; return rc; } /* * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and * can be accessed with: * * LSM_RET_DEFAULT(<hook_name>) * * The macros below define static constants for the default value of each * LSM hook. */ #define LSM_RET_DEFAULT(NAME) (NAME##_default) #define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME) #define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \ static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK /* * Hook list operation macros. * * call_void_hook: * This is a hook that does not return a value. * * call_int_hook: * This is a hook that returns a value. */ #define __CALL_STATIC_VOID(NUM, HOOK, ...) \ do { \ if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) { \ static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__); \ } \ } while (0); #define call_void_hook(HOOK, ...) \ do { \ LSM_LOOP_UNROLL(__CALL_STATIC_VOID, HOOK, __VA_ARGS__); \ } while (0) #define __CALL_STATIC_INT(NUM, R, HOOK, LABEL, ...) \ do { \ if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) { \ R = static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__); \ if (R != LSM_RET_DEFAULT(HOOK)) \ goto LABEL; \ } \ } while (0); #define call_int_hook(HOOK, ...) \ ({ \ __label__ OUT; \ int RC = LSM_RET_DEFAULT(HOOK); \ \ LSM_LOOP_UNROLL(__CALL_STATIC_INT, RC, HOOK, OUT, __VA_ARGS__); \ OUT: \ RC; \ }) #define lsm_for_each_hook(scall, NAME) \ for (scall = static_calls_table.NAME; \ scall - static_calls_table.NAME < MAX_LSM_COUNT; scall++) \ if (static_key_enabled(&scall->active->key)) /* Security operations */ /** * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok * @mgr: task credentials of current binder process * * Check whether @mgr is allowed to be the binder context manager. * * Return: Return 0 if permission is granted. */ int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, mgr); } /** * security_binder_transaction() - Check if a binder transaction is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to invoke a binder transaction call to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transaction(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transaction, from, to); } /** * security_binder_transfer_binder() - Check if a binder transfer is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to transfer a binder reference to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transfer_binder(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transfer_binder, from, to); } /** * security_binder_transfer_file() - Check if a binder file xfer is allowed * @from: sending process * @to: receiving process * @file: file being transferred * * Check whether @from is allowed to transfer @file to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transfer_file(const struct cred *from, const struct cred *to, const struct file *file) { return call_int_hook(binder_transfer_file, from, to, file); } /** * security_ptrace_access_check() - Check if tracing is allowed * @child: target process * @mode: PTRACE_MODE flags * * Check permission before allowing the current process to trace the @child * process. Security modules may also want to perform a process tracing check * during an execve in the set_security or apply_creds hooks of tracing check * during an execve in the bprm_set_creds hook of binprm_security_ops if the * process is being traced and its security attributes would be changed by the * execve. * * Return: Returns 0 if permission is granted. */ int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { return call_int_hook(ptrace_access_check, child, mode); } /** * security_ptrace_traceme() - Check if tracing is allowed * @parent: tracing process * * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself to the * @parent process for tracing. * * Return: Returns 0 if permission is granted. */ int security_ptrace_traceme(struct task_struct *parent) { return call_int_hook(ptrace_traceme, parent); } /** * security_capget() - Get the capability sets for a process * @target: target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Get the @effective, @inheritable, and @permitted capability sets for the * @target process. The hook may also perform permission checking to determine * if the current process is allowed to see the capability sets of the @target * process. * * Return: Returns 0 if the capability sets were successfully obtained. */ int security_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return call_int_hook(capget, target, effective, inheritable, permitted); } /** * security_capset() - Set the capability sets for a process * @new: new credentials for the target process * @old: current credentials of the target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Set the @effective, @inheritable, and @permitted capability sets for the * current process. * * Return: Returns 0 and update @new if permission is granted. */ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return call_int_hook(capset, new, old, effective, inheritable, permitted); } /** * security_capable() - Check if a process has the necessary capability * @cred: credentials to examine * @ns: user namespace * @cap: capability requested * @opts: capability check options * * Check whether the @tsk process has the @cap capability in the indicated * credentials. @cap contains the capability <include/linux/capability.h>. * @opts contains options for the capable check <include/linux/security.h>. * * Return: Returns 0 if the capability is granted. */ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return call_int_hook(capable, cred, ns, cap, opts); } /** * security_quotactl() - Check if a quotactl() syscall is allowed for this fs * @cmds: commands * @type: type * @id: id * @sb: filesystem * * Check whether the quotactl syscall is allowed for this @sb. * * Return: Returns 0 if permission is granted. */ int security_quotactl(int cmds, int type, int id, const struct super_block *sb) { return call_int_hook(quotactl, cmds, type, id, sb); } /** * security_quota_on() - Check if QUOTAON is allowed for a dentry * @dentry: dentry * * Check whether QUOTAON is allowed for @dentry. * * Return: Returns 0 if permission is granted. */ int security_quota_on(struct dentry *dentry) { return call_int_hook(quota_on, dentry); } /** * security_syslog() - Check if accessing the kernel message ring is allowed * @type: SYSLOG_ACTION_* type * * Check permission before accessing the kernel message ring or changing * logging to the console. See the syslog(2) manual page for an explanation of * the @type values. * * Return: Return 0 if permission is granted. */ int security_syslog(int type) { return call_int_hook(syslog, type); } /** * security_settime64() - Check if changing the system time is allowed * @ts: new time * @tz: timezone * * Check permission to change the system time, struct timespec64 is defined in * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>. * * Return: Returns 0 if permission is granted. */ int security_settime64(const struct timespec64 *ts, const struct timezone *tz) { return call_int_hook(settime, ts, tz); } /** * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed * @mm: mm struct * @pages: number of pages * * Check permissions for allocating a new virtual mapping. If all LSMs return * a positive value, __vm_enough_memory() will be called with cap_sys_admin * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be * called with cap_sys_admin cleared. * * Return: Returns 0 if permission is granted by the LSM infrastructure to the * caller. */ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { struct lsm_static_call *scall; int cap_sys_admin = 1; int rc; /* * The module will respond with 0 if it thinks the __vm_enough_memory() * call should be made with the cap_sys_admin set. If all of the modules * agree that it should be set it will. If any module thinks it should * not be set it won't. */ lsm_for_each_hook(scall, vm_enough_memory) { rc = scall->hl->hook.vm_enough_memory(mm, pages); if (rc < 0) { cap_sys_admin = 0; break; } } return __vm_enough_memory(mm, pages, cap_sys_admin); } /** * security_bprm_creds_for_exec() - Prepare the credentials for exec() * @bprm: binary program information * * If the setup in prepare_exec_creds did not setup @bprm->cred->security * properly for executing @bprm->file, update the LSM's portion of * @bprm->cred->security to be what commit_creds needs to install for the new * program. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. @bprm * contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_for_exec(struct linux_binprm *bprm) { return call_int_hook(bprm_creds_for_exec, bprm); } /** * security_bprm_creds_from_file() - Update linux_binprm creds based on file * @bprm: binary program information * @file: associated file * * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon * exec, update @bprm->cred to reflect that change. This is called after * finding the binary that will be executed without an interpreter. This * ensures that the credentials will not be derived from a script that the * binary will need to reopen, which when reopend may end up being a completely * different file. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. The * hook must add to @bprm->per_clear any personality flags that should be * cleared from current->personality. @bprm contains the linux_binprm * structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file) { return call_int_hook(bprm_creds_from_file, bprm, file); } /** * security_bprm_check() - Mediate binary handler search * @bprm: binary program information * * This hook mediates the point when a search for a binary handler will begin. * It allows a check against the @bprm->cred->security value which was set in * the preceding creds_for_exec call. The argv list and envp list are reliably * available in @bprm. This hook may be called multiple times during a single * execve. @bprm contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_check(struct linux_binprm *bprm) { return call_int_hook(bprm_check_security, bprm); } /** * security_bprm_committing_creds() - Install creds for a process during exec() * @bprm: binary program information * * Prepare to install the new security attributes of a process being * transformed by an execve operation, based on the old credentials pointed to * by @current->cred and the information set in @bprm->cred by the * bprm_creds_for_exec hook. @bprm points to the linux_binprm structure. This * hook is a good place to perform state changes on the process such as closing * open file descriptors to which access will no longer be granted when the * attributes are changed. This is called immediately before commit_creds(). */ void security_bprm_committing_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committing_creds, bprm); } /** * security_bprm_committed_creds() - Tidy up after cred install during exec() * @bprm: binary program information * * Tidy up after the installation of the new security attributes of a process * being transformed by an execve operation. The new credentials have, by this * point, been set to @current->cred. @bprm points to the linux_binprm * structure. This hook is a good place to perform state changes on the * process such as clearing out non-inheritable signal state. This is called * immediately after commit_creds(). */ void security_bprm_committed_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committed_creds, bprm); } /** * security_fs_context_submount() - Initialise fc->security * @fc: new filesystem context * @reference: dentry reference for submount/remount * * Fill out the ->security field for a new fs_context. * * Return: Returns 0 on success or negative error code on failure. */ int security_fs_context_submount(struct fs_context *fc, struct super_block *reference) { return call_int_hook(fs_context_submount, fc, reference); } /** * security_fs_context_dup() - Duplicate a fs_context LSM blob * @fc: destination filesystem context * @src_fc: source filesystem context * * Allocate and attach a security structure to sc->security. This pointer is * initialised to NULL by the caller. @fc indicates the new filesystem context. * @src_fc indicates the original filesystem context. * * Return: Returns 0 on success or a negative error code on failure. */ int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { return call_int_hook(fs_context_dup, fc, src_fc); } /** * security_fs_context_parse_param() - Configure a filesystem context * @fc: filesystem context * @param: filesystem parameter * * Userspace provided a parameter to configure a superblock. The LSM can * consume the parameter or return it to the caller for use elsewhere. * * Return: If the parameter is used by the LSM it should return 0, if it is * returned to the caller -ENOPARAM is returned, otherwise a negative * error code is returned. */ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct lsm_static_call *scall; int trc; int rc = -ENOPARAM; lsm_for_each_hook(scall, fs_context_parse_param) { trc = scall->hl->hook.fs_context_parse_param(fc, param); if (trc == 0) rc = 0; else if (trc != -ENOPARAM) return trc; } return rc; } /** * security_sb_alloc() - Allocate a super_block LSM blob * @sb: filesystem superblock * * Allocate and attach a security structure to the sb->s_security field. The * s_security field is initialized to NULL when the structure is allocated. * @sb contains the super_block structure to be modified. * * Return: Returns 0 if operation was successful. */ int security_sb_alloc(struct super_block *sb) { int rc = lsm_superblock_alloc(sb); if (unlikely(rc)) return rc; rc = call_int_hook(sb_alloc_security, sb); if (unlikely(rc)) security_sb_free(sb); return rc; } /** * security_sb_delete() - Release super_block LSM associated objects * @sb: filesystem superblock * * Release objects tied to a superblock (e.g. inodes). @sb contains the * super_block structure being released. */ void security_sb_delete(struct super_block *sb) { call_void_hook(sb_delete, sb); } /** * security_sb_free() - Free a super_block LSM blob * @sb: filesystem superblock * * Deallocate and clear the sb->s_security field. @sb contains the super_block * structure to be modified. */ void security_sb_free(struct super_block *sb) { call_void_hook(sb_free_security, sb); kfree(sb->s_security); sb->s_security = NULL; } /** * security_free_mnt_opts() - Free memory associated with mount options * @mnt_opts: LSM processed mount options * * Free memory associated with @mnt_ops. */ void security_free_mnt_opts(void **mnt_opts) { if (!*mnt_opts) return; call_void_hook(sb_free_mnt_opts, *mnt_opts); *mnt_opts = NULL; } EXPORT_SYMBOL(security_free_mnt_opts); /** * security_sb_eat_lsm_opts() - Consume LSM mount options * @options: mount options * @mnt_opts: LSM processed mount options * * Eat (scan @options) and save them in @mnt_opts. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_eat_lsm_opts(char *options, void **mnt_opts) { return call_int_hook(sb_eat_lsm_opts, options, mnt_opts); } EXPORT_SYMBOL(security_sb_eat_lsm_opts); /** * security_sb_mnt_opts_compat() - Check if new mount options are allowed * @sb: filesystem superblock * @mnt_opts: new mount options * * Determine if the new mount options in @mnt_opts are allowed given the * existing mounted filesystem at @sb. @sb superblock being compared. * * Return: Returns 0 if options are compatible. */ int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_mnt_opts_compat, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_mnt_opts_compat); /** * security_sb_remount() - Verify no incompatible mount changes during remount * @sb: filesystem superblock * @mnt_opts: (re)mount options * * Extracts security system specific mount options and verifies no changes are * being made to those options. * * Return: Returns 0 if permission is granted. */ int security_sb_remount(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_remount, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_remount); /** * security_sb_kern_mount() - Check if a kernel mount is allowed * @sb: filesystem superblock * * Mount this @sb if allowed by permissions. * * Return: Returns 0 if permission is granted. */ int security_sb_kern_mount(const struct super_block *sb) { return call_int_hook(sb_kern_mount, sb); } /** * security_sb_show_options() - Output the mount options for a superblock * @m: output file * @sb: filesystem superblock * * Show (print on @m) mount options for this @sb. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_show_options(struct seq_file *m, struct super_block *sb) { return call_int_hook(sb_show_options, m, sb); } /** * security_sb_statfs() - Check if accessing fs stats is allowed * @dentry: superblock handle * * Check permission before obtaining filesystem statistics for the @mnt * mountpoint. @dentry is a handle on the superblock for the filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_statfs(struct dentry *dentry) { return call_int_hook(sb_statfs, dentry); } /** * security_sb_mount() - Check permission for mounting a filesystem * @dev_name: filesystem backing device * @path: mount point * @type: filesystem type * @flags: mount flags * @data: filesystem specific data * * Check permission before an object specified by @dev_name is mounted on the * mount point named by @nd. For an ordinary mount, @dev_name identifies a * device if the file system type requires a device. For a remount * (@flags & MS_REMOUNT), @dev_name is irrelevant. For a loopback/bind mount * (@flags & MS_BIND), @dev_name identifies the pathname of the object being * mounted. * * Return: Returns 0 if permission is granted. */ int security_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return call_int_hook(sb_mount, dev_name, path, type, flags, data); } /** * security_sb_umount() - Check permission for unmounting a filesystem * @mnt: mounted filesystem * @flags: unmount flags * * Check permission before the @mnt file system is unmounted. * * Return: Returns 0 if permission is granted. */ int security_sb_umount(struct vfsmount *mnt, int flags) { return call_int_hook(sb_umount, mnt, flags); } /** * security_sb_pivotroot() - Check permissions for pivoting the rootfs * @old_path: new location for current rootfs * @new_path: location of the new rootfs * * Check permission before pivoting the root filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return call_int_hook(sb_pivotroot, old_path, new_path); } /** * security_sb_set_mnt_opts() - Set the mount options for a filesystem * @sb: filesystem superblock * @mnt_opts: binary mount options * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Set the security relevant mount options used for a superblock. * * Return: Returns 0 on success, error on failure. */ int security_sb_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { struct lsm_static_call *scall; int rc = mnt_opts ? -EOPNOTSUPP : LSM_RET_DEFAULT(sb_set_mnt_opts); lsm_for_each_hook(scall, sb_set_mnt_opts) { rc = scall->hl->hook.sb_set_mnt_opts(sb, mnt_opts, kern_flags, set_kern_flags); if (rc != LSM_RET_DEFAULT(sb_set_mnt_opts)) break; } return rc; } EXPORT_SYMBOL(security_sb_set_mnt_opts); /** * security_sb_clone_mnt_opts() - Duplicate superblock mount options * @oldsb: source superblock * @newsb: destination superblock * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Copy all security options from a given superblock to another. * * Return: Returns 0 on success, error on failure. */ int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_clone_mnt_opts, oldsb, newsb, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); /** * security_move_mount() - Check permissions for moving a mount * @from_path: source mount point * @to_path: destination mount point * * Check permission before a mount is moved. * * Return: Returns 0 if permission is granted. */ int security_move_mount(const struct path *from_path, const struct path *to_path) { return call_int_hook(move_mount, from_path, to_path); } /** * security_path_notify() - Check if setting a watch is allowed * @path: file path * @mask: event mask * @obj_type: file path type * * Check permissions before setting a watch on events as defined by @mask, on * an object at @path, whose type is defined by @obj_type. * * Return: Returns 0 if permission is granted. */ int security_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { return call_int_hook(path_notify, path, mask, obj_type); } /** * security_inode_alloc() - Allocate an inode LSM blob * @inode: the inode * @gfp: allocation flags * * Allocate and attach a security structure to @inode->i_security. The * i_security field is initialized to NULL when the inode structure is * allocated. * * Return: Return 0 if operation was successful. */ int security_inode_alloc(struct inode *inode, gfp_t gfp) { int rc = lsm_inode_alloc(inode, gfp); if (unlikely(rc)) return rc; rc = call_int_hook(inode_alloc_security, inode); if (unlikely(rc)) security_inode_free(inode); return rc; } static void inode_free_by_rcu(struct rcu_head *head) { /* The rcu head is at the start of the inode blob */ call_void_hook(inode_free_security_rcu, head); kmem_cache_free(lsm_inode_cache, head); } /** * security_inode_free() - Free an inode's LSM blob * @inode: the inode * * Release any LSM resources associated with @inode, although due to the * inode's RCU protections it is possible that the resources will not be * fully released until after the current RCU grace period has elapsed. * * It is important for LSMs to note that despite being present in a call to * security_inode_free(), @inode may still be referenced in a VFS path walk * and calls to security_inode_permission() may be made during, or after, * a call to security_inode_free(). For this reason the inode->i_security * field is released via a call_rcu() callback and any LSMs which need to * retain inode state for use in security_inode_permission() should only * release that state in the inode_free_security_rcu() LSM hook callback. */ void security_inode_free(struct inode *inode) { call_void_hook(inode_free_security, inode); if (!inode->i_security) return; call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu); } /** * security_dentry_init_security() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @xattr_name: name of the security/LSM xattr * @ctx: pointer to the resulting LSM context * @ctxlen: length of @ctx * * Compute a context for a dentry as the inode is not yet available since NFSv4 * has no label backed by an EA anyway. It is important to note that * @xattr_name does not need to be free'd by the caller, it is a static string. * * Return: Returns 0 on success, negative values on failure. */ int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) { return call_int_hook(dentry_init_security, dentry, mode, name, xattr_name, ctx, ctxlen); } EXPORT_SYMBOL(security_dentry_init_security); /** * security_dentry_create_files_as() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @old: creds to use for LSM context calculations * @new: creds to modify * * Compute a context for a dentry as the inode is not yet available and set * that context in passed in creds so that new files are created using that * context. Context is calculated using the passed in creds and not the creds * of the caller. * * Return: Returns 0 on success, error on failure. */ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, dentry, mode, name, old, new); } EXPORT_SYMBOL(security_dentry_create_files_as); /** * security_inode_init_security() - Initialize an inode's LSM context * @inode: the inode * @dir: parent directory * @qstr: last component of the pathname * @initxattrs: callback function to write xattrs * @fs_data: filesystem specific data * * Obtain the security attribute name suffix and value to set on a newly * created inode and set up the incore security field for the new inode. This * hook is called by the fs code as part of the inode creation transaction and * provides for atomic labeling of the inode, unlike the post_create/mkdir/... * hooks called by the VFS. * * The hook function is expected to populate the xattrs array, by calling * lsm_get_xattr_slot() to retrieve the slots reserved by the security module * with the lbs_xattr_count field of the lsm_blob_sizes structure. For each * slot, the hook function should set ->name to the attribute name suffix * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it * to the attribute value, to set ->value_len to the length of the value. If * the security module does not use security attributes or does not wish to put * a security attribute on this particular inode, then it should return * -EOPNOTSUPP to skip this processing. * * Return: Returns 0 if the LSM successfully initialized all of the inode * security attributes that are required, negative values otherwise. */ int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const initxattrs initxattrs, void *fs_data) { struct lsm_static_call *scall; struct xattr *new_xattrs = NULL; int ret = -EOPNOTSUPP, xattr_count = 0; if (unlikely(IS_PRIVATE(inode))) return 0; if (!blob_sizes.lbs_xattr_count) return 0; if (initxattrs) { /* Allocate +1 as terminator. */ new_xattrs = kcalloc(blob_sizes.lbs_xattr_count + 1, sizeof(*new_xattrs), GFP_NOFS); if (!new_xattrs) return -ENOMEM; } lsm_for_each_hook(scall, inode_init_security) { ret = scall->hl->hook.inode_init_security(inode, dir, qstr, new_xattrs, &xattr_count); if (ret && ret != -EOPNOTSUPP) goto out; /* * As documented in lsm_hooks.h, -EOPNOTSUPP in this context * means that the LSM is not willing to provide an xattr, not * that it wants to signal an error. Thus, continue to invoke * the remaining LSMs. */ } /* If initxattrs() is NULL, xattr_count is zero, skip the call. */ if (!xattr_count) goto out; ret = initxattrs(inode, new_xattrs, fs_data); out: for (; xattr_count > 0; xattr_count--) kfree(new_xattrs[xattr_count - 1].value); kfree(new_xattrs); return (ret == -EOPNOTSUPP) ? 0 : ret; } EXPORT_SYMBOL(security_inode_init_security); /** * security_inode_init_security_anon() - Initialize an anonymous inode * @inode: the inode * @name: the anonymous inode class * @context_inode: an optional related inode * * Set up the incore security field for the new anonymous inode and return * whether the inode creation is permitted by the security module or not. * * Return: Returns 0 on success, -EACCES if the security module denies the * creation of this inode, or another -errno upon other errors. */ int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) { return call_int_hook(inode_init_security_anon, inode, name, context_inode); } #ifdef CONFIG_SECURITY_PATH /** * security_path_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a file. Note that this hook is called even * if mknod operation is being done for a regular file. * * Return: Returns 0 if permission is granted. */ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mknod, dir, dentry, mode, dev); } EXPORT_SYMBOL(security_path_mknod); /** * security_path_post_mknod() - Update inode security after reg file creation * @idmap: idmap of the mount * @dentry: new file * * Update inode security field after a regular file has been created. */ void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(path_post_mknod, idmap, dentry); } /** * security_path_mkdir() - Check if creating a new directory is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory. * * Return: Returns 0 if permission is granted. */ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mkdir, dir, dentry, mode); } EXPORT_SYMBOL(security_path_mkdir); /** * security_path_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to remove * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_path_rmdir(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_rmdir, dir, dentry); } /** * security_path_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_unlink(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_unlink, dir, dentry); } EXPORT_SYMBOL(security_path_unlink); /** * security_path_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: file pathname * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_symlink, dir, dentry, old_name); } /** * security_path_link - Check if creating a hard link is allowed * @old_dentry: existing file * @new_dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(path_link, old_dentry, new_dir, new_dentry); } /** * security_path_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; return call_int_hook(path_rename, old_dir, old_dentry, new_dir, new_dentry, flags); } EXPORT_SYMBOL(security_path_rename); /** * security_path_truncate() - Check if truncating a file is allowed * @path: file * * Check permission before truncating the file indicated by path. Note that * truncation permissions may also be checked based on already opened files, * using the security_file_truncate() hook. * * Return: Returns 0 if permission is granted. */ int security_path_truncate(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_truncate, path); } /** * security_path_chmod() - Check if changing the file's mode is allowed * @path: file * @mode: new mode * * Check for permission to change a mode of the file @path. The new mode is * specified in @mode which is a bitmask of constants from * <include/uapi/linux/stat.h>. * * Return: Returns 0 if permission is granted. */ int security_path_chmod(const struct path *path, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chmod, path, mode); } /** * security_path_chown() - Check if changing the file's owner/group is allowed * @path: file * @uid: file owner * @gid: file group * * Check for permission to change owner/group of a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chown, path, uid, gid); } /** * security_path_chroot() - Check if changing the root directory is allowed * @path: directory * * Check for permission to change root directory. * * Return: Returns 0 if permission is granted. */ int security_path_chroot(const struct path *path) { return call_int_hook(path_chroot, path); } #endif /* CONFIG_SECURITY_PATH */ /** * security_inode_create() - Check if creating a file is allowed * @dir: the parent directory * @dentry: the file being created * @mode: requested file mode * * Check permission to create a regular file. * * Return: Returns 0 if permission is granted. */ int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_create, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_create); /** * security_inode_post_create_tmpfile() - Update inode security of new tmpfile * @idmap: idmap of the mount * @inode: inode of the new tmpfile * * Update inode security data after a tmpfile has been created. */ void security_inode_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { if (unlikely(IS_PRIVATE(inode))) return; call_void_hook(inode_post_create_tmpfile, idmap, inode); } /** * security_inode_link() - Check if creating a hard link is allowed * @old_dentry: existing file * @dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(inode_link, old_dentry, dir, new_dentry); } /** * security_inode_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_unlink(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_unlink, dir, dentry); } /** * security_inode_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: existing filename * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_symlink(struct inode *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_symlink, dir, dentry, old_name); } /** * security_inode_mkdir() - Check if creation a new director is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory * associated with inode structure @dir. * * Return: Returns 0 if permission is granted. */ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mkdir, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_mkdir); /** * security_inode_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to be removed * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_rmdir, dir, dentry); } /** * security_inode_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a special file (or a socket or a fifo file * created via the mknod system call). Note that if mknod operation is being * done for a regular file, then the create hook will be called and not this * hook. * * Return: Returns 0 if permission is granted. */ int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mknod, dir, dentry, mode, dev); } /** * security_inode_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(inode_rename, new_dir, new_dentry, old_dir, old_dentry); if (err) return err; } return call_int_hook(inode_rename, old_dir, old_dentry, new_dir, new_dentry); } /** * security_inode_readlink() - Check if reading a symbolic link is allowed * @dentry: link * * Check the permission to read the symbolic link. * * Return: Returns 0 if permission is granted. */ int security_inode_readlink(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_readlink, dentry); } /** * security_inode_follow_link() - Check if following a symbolic link is allowed * @dentry: link dentry * @inode: link inode * @rcu: true if in RCU-walk mode * * Check permission to follow a symbolic link when looking up a pathname. If * @rcu is true, @inode is not stable. * * Return: Returns 0 if permission is granted. */ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_follow_link, dentry, inode, rcu); } /** * security_inode_permission() - Check if accessing an inode is allowed * @inode: inode * @mask: access mask * * Check permission before accessing an inode. This hook is called by the * existing Linux permission function, so a security module can use it to * provide additional checking for existing Linux permission checks. Notice * that this hook is called when a file is opened (as well as many other * operations), whereas the file_security_ops permission hook is called when * the actual read/write operations are performed. * * Return: Returns 0 if permission is granted. */ int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_permission, inode, mask); } /** * security_inode_setattr() - Check if setting file attributes is allowed * @idmap: idmap of the mount * @dentry: file * @attr: new attributes * * Check permission before setting file attributes. Note that the kernel call * to notify_change is performed from several locations, whenever file * attributes change (such as when a file is truncated, chown/chmod operations, * transferring disk quotas, etc). * * Return: Returns 0 if permission is granted. */ int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_setattr, idmap, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); /** * security_inode_post_setattr() - Update the inode after a setattr operation * @idmap: idmap of the mount * @dentry: file * @ia_valid: file attributes set * * Update inode security field after successful setting file attributes. */ void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setattr, idmap, dentry, ia_valid); } /** * security_inode_getattr() - Check if getting file attributes is allowed * @path: file * * Check permission before obtaining file attributes. * * Return: Returns 0 if permission is granted. */ int security_inode_getattr(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(inode_getattr, path); } /** * security_inode_setxattr() - Check if setting file xattrs is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * @value: xattr value * @size: size of xattr value * @flags: flags * * This hook performs the desired permission checks before setting the extended * attributes (xattrs) on @dentry. It is important to note that we have some * additional logic before the main LSM implementation calls to detect if we * need to perform an additional capability check at the LSM layer. * * Normally we enforce a capability check prior to executing the various LSM * hook implementations, but if a LSM wants to avoid this capability check, * it can register a 'inode_xattr_skipcap' hook and return a value of 1 for * xattrs that it wants to avoid the capability check, leaving the LSM fully * responsible for enforcing the access control for the specific xattr. If all * of the enabled LSMs refrain from registering a 'inode_xattr_skipcap' hook, * or return a 0 (the default return value), the capability check is still * performed. If no 'inode_xattr_skipcap' hooks are registered the capability * check is performed. * * Return: Returns 0 if permission is granted. */ int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int rc; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* enforce the capability checks at the lsm layer, if needed */ if (!call_int_hook(inode_xattr_skipcap, name)) { rc = cap_inode_setxattr(dentry, name, value, size, flags); if (rc) return rc; } return call_int_hook(inode_setxattr, idmap, dentry, name, value, size, flags); } /** * security_inode_set_acl() - Check if setting posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * @kacl: acl struct * * Check permission before setting posix acls, the posix acls in @kacl are * identified by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_set_acl, idmap, dentry, acl_name, kacl); } /** * security_inode_post_set_acl() - Update inode security from posix acls set * @dentry: file * @acl_name: acl name * @kacl: acl struct * * Update inode security data after successfully setting posix acls on @dentry. * The posix acls in @kacl are identified by @acl_name. */ void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_set_acl, dentry, acl_name, kacl); } /** * security_inode_get_acl() - Check if reading posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before getting osix acls, the posix acls are identified by * @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_get_acl, idmap, dentry, acl_name); } /** * security_inode_remove_acl() - Check if removing a posix acl is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before removing posix acls, the posix acls are identified * by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_remove_acl, idmap, dentry, acl_name); } /** * security_inode_post_remove_acl() - Update inode security after rm posix acls * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Update inode security data after successfully removing posix acls on * @dentry in @idmap. The posix acls are identified by @acl_name. */ void security_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_remove_acl, idmap, dentry, acl_name); } /** * security_inode_post_setxattr() - Update the inode after a setxattr operation * @dentry: file * @name: xattr name * @value: xattr value * @size: xattr value size * @flags: flags * * Update inode security field after successful setxattr operation. */ void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setxattr, dentry, name, value, size, flags); } /** * security_inode_getxattr() - Check if xattr access is allowed * @dentry: file * @name: xattr name * * Check permission before obtaining the extended attributes identified by * @name for @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_getxattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_getxattr, dentry, name); } /** * security_inode_listxattr() - Check if listing xattrs is allowed * @dentry: file * * Check permission before obtaining the list of extended attribute names for * @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_listxattr(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_listxattr, dentry); } /** * security_inode_removexattr() - Check if removing an xattr is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * * This hook performs the desired permission checks before setting the extended * attributes (xattrs) on @dentry. It is important to note that we have some * additional logic before the main LSM implementation calls to detect if we * need to perform an additional capability check at the LSM layer. * * Normally we enforce a capability check prior to executing the various LSM * hook implementations, but if a LSM wants to avoid this capability check, * it can register a 'inode_xattr_skipcap' hook and return a value of 1 for * xattrs that it wants to avoid the capability check, leaving the LSM fully * responsible for enforcing the access control for the specific xattr. If all * of the enabled LSMs refrain from registering a 'inode_xattr_skipcap' hook, * or return a 0 (the default return value), the capability check is still * performed. If no 'inode_xattr_skipcap' hooks are registered the capability * check is performed. * * Return: Returns 0 if permission is granted. */ int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { int rc; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* enforce the capability checks at the lsm layer, if needed */ if (!call_int_hook(inode_xattr_skipcap, name)) { rc = cap_inode_removexattr(idmap, dentry, name); if (rc) return rc; } return call_int_hook(inode_removexattr, idmap, dentry, name); } /** * security_inode_post_removexattr() - Update the inode after a removexattr op * @dentry: file * @name: xattr name * * Update the inode after a successful removexattr operation. */ void security_inode_post_removexattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_removexattr, dentry, name); } /** * security_inode_need_killpriv() - Check if security_inode_killpriv() required * @dentry: associated dentry * * Called when an inode has been changed to determine if * security_inode_killpriv() should be called. * * Return: Return <0 on error to abort the inode change operation, return 0 if * security_inode_killpriv() does not need to be called, return >0 if * security_inode_killpriv() does need to be called. */ int security_inode_need_killpriv(struct dentry *dentry) { return call_int_hook(inode_need_killpriv, dentry); } /** * security_inode_killpriv() - The setuid bit is removed, update LSM state * @idmap: idmap of the mount * @dentry: associated dentry * * The @dentry's setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. * * Return: Return 0 on success. If error is returned, then the operation * causing setuid bit removal is failed. */ int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { return call_int_hook(inode_killpriv, idmap, dentry); } /** * security_inode_getsecurity() - Get the xattr security label of an inode * @idmap: idmap of the mount * @inode: inode * @name: xattr name * @buffer: security label buffer * @alloc: allocation flag * * Retrieve a copy of the extended attribute representation of the security * label associated with @name for @inode via @buffer. Note that @name is the * remainder of the attribute name after the security prefix has been removed. * @alloc is used to specify if the call should return a value via the buffer * or just the value length. * * Return: Returns size of buffer on success. */ int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_getsecurity); return call_int_hook(inode_getsecurity, idmap, inode, name, buffer, alloc); } /** * security_inode_setsecurity() - Set the xattr security label of an inode * @inode: inode * @name: xattr name * @value: security label * @size: length of security label * @flags: flags * * Set the security label associated with @name for @inode from the extended * attribute value @value. @size indicates the size of the @value in bytes. * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the * remainder of the attribute name after the security. prefix has been removed. * * Return: Returns 0 on success. */ int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_setsecurity); return call_int_hook(inode_setsecurity, inode, name, value, size, flags); } /** * security_inode_listsecurity() - List the xattr security label names * @inode: inode * @buffer: buffer * @buffer_size: size of buffer * * Copy the extended attribute names for the security labels associated with * @inode into @buffer. The maximum size of @buffer is specified by * @buffer_size. @buffer may be NULL to request the size of the buffer * required. * * Return: Returns number of bytes used/required on success. */ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_listsecurity, inode, buffer, buffer_size); } EXPORT_SYMBOL(security_inode_listsecurity); /** * security_inode_getsecid() - Get an inode's secid * @inode: inode * @secid: secid to return * * Get the secid associated with the node. In case of failure, @secid will be * set to zero. */ void security_inode_getsecid(struct inode *inode, u32 *secid) { call_void_hook(inode_getsecid, inode, secid); } /** * security_inode_copy_up() - Create new creds for an overlayfs copy-up op * @src: union dentry of copy-up file * @new: newly created creds * * A file is about to be copied up from lower layer to upper layer of overlay * filesystem. Security module can prepare a set of new creds and modify as * need be and return new creds. Caller will switch to new creds temporarily to * create new file and release newly allocated creds. * * Return: Returns 0 on success or a negative error code on error. */ int security_inode_copy_up(struct dentry *src, struct cred **new) { return call_int_hook(inode_copy_up, src, new); } EXPORT_SYMBOL(security_inode_copy_up); /** * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op * @src: union dentry of copy-up file * @name: xattr name * * Filter the xattrs being copied up when a unioned file is copied up from a * lower layer to the union/overlay layer. The caller is responsible for * reading and writing the xattrs, this hook is merely a filter. * * Return: Returns 0 to accept the xattr, -ECANCELED to discard the xattr, * -EOPNOTSUPP if the security module does not know about attribute, * or a negative error code to abort the copy up. */ int security_inode_copy_up_xattr(struct dentry *src, const char *name) { int rc; rc = call_int_hook(inode_copy_up_xattr, src, name); if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) return rc; return LSM_RET_DEFAULT(inode_copy_up_xattr); } EXPORT_SYMBOL(security_inode_copy_up_xattr); /** * security_inode_setintegrity() - Set the inode's integrity data * @inode: inode * @type: type of integrity, e.g. hash digest, signature, etc * @value: the integrity value * @size: size of the integrity value * * Register a verified integrity measurement of a inode with LSMs. * LSMs should free the previously saved data if @value is NULL. * * Return: Returns 0 on success, negative values on failure. */ int security_inode_setintegrity(const struct inode *inode, enum lsm_integrity_type type, const void *value, size_t size) { return call_int_hook(inode_setintegrity, inode, type, value, size); } EXPORT_SYMBOL(security_inode_setintegrity); /** * security_kernfs_init_security() - Init LSM context for a kernfs node * @kn_dir: parent kernfs node * @kn: the kernfs node to initialize * * Initialize the security context of a newly created kernfs node based on its * own and its parent's attributes. * * Return: Returns 0 if permission is granted. */ int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { return call_int_hook(kernfs_init_security, kn_dir, kn); } /** * security_file_permission() - Check file permissions * @file: file * @mask: requested permissions * * Check file permissions before accessing an open file. This hook is called * by various operations that read or write files. A security module can use * this hook to perform additional checking on these operations, e.g. to * revalidate permissions on use to support privilege bracketing or policy * changes. Notice that this hook is used when the actual read/write * operations are performed, whereas the inode_security_ops hook is called when * a file is opened (as well as many other operations). Although this hook can * be used to revalidate permissions for various system call operations that * read or write files, it does not address the revalidation of permissions for * memory-mapped files. Security modules must handle this separately if they * need such revalidation. * * Return: Returns 0 if permission is granted. */ int security_file_permission(struct file *file, int mask) { return call_int_hook(file_permission, file, mask); } /** * security_file_alloc() - Allocate and init a file's LSM blob * @file: the file * * Allocate and attach a security structure to the file->f_security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if the hook is successful and permission is granted. */ int security_file_alloc(struct file *file) { int rc = lsm_file_alloc(file); if (rc) return rc; rc = call_int_hook(file_alloc_security, file); if (unlikely(rc)) security_file_free(file); return rc; } /** * security_file_release() - Perform actions before releasing the file ref * @file: the file * * Perform actions before releasing the last reference to a file. */ void security_file_release(struct file *file) { call_void_hook(file_release, file); } /** * security_file_free() - Free a file's LSM blob * @file: the file * * Deallocate and free any security structures stored in file->f_security. */ void security_file_free(struct file *file) { void *blob; call_void_hook(file_free_security, file); blob = file->f_security; if (blob) { file->f_security = NULL; kmem_cache_free(lsm_file_cache, blob); } } /** * security_file_ioctl() - Check if an ioctl is allowed * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Check permission for an ioctl operation on @file. Note that @arg sometimes * represents a user space pointer; in other cases, it may be a simple integer * value. When @arg represents a user space pointer, it should never be used * by the security module. * * Return: Returns 0 if permission is granted. */ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl); /** * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Compat version of security_file_ioctl() that correctly handles 32-bit * processes running on 64-bit kernels. * * Return: Returns 0 if permission is granted. */ int security_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl_compat, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl_compat); static inline unsigned long mmap_prot(struct file *file, unsigned long prot) { /* * Does we have PROT_READ and does the application expect * it to imply PROT_EXEC? If not, nothing to talk about... */ if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ) return prot; if (!(current->personality & READ_IMPLIES_EXEC)) return prot; /* * if that's an anonymous mapping, let it. */ if (!file) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!path_noexec(&file->f_path)) { #ifndef CONFIG_MMU if (file->f_op->mmap_capabilities) { unsigned caps = file->f_op->mmap_capabilities(file); if (!(caps & NOMMU_MAP_EXEC)) return prot; } #endif return prot | PROT_EXEC; } /* anything on noexec mount won't get PROT_EXEC */ return prot; } /** * security_mmap_file() - Check if mmap'ing a file is allowed * @file: file * @prot: protection applied by the kernel * @flags: flags * * Check permissions for a mmap operation. The @file may be NULL, e.g. if * mapping anonymous memory. * * Return: Returns 0 if permission is granted. */ int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { return call_int_hook(mmap_file, file, prot, mmap_prot(file, prot), flags); } /** * security_mmap_addr() - Check if mmap'ing an address is allowed * @addr: address * * Check permissions for a mmap operation at @addr. * * Return: Returns 0 if permission is granted. */ int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, addr); } /** * security_file_mprotect() - Check if changing memory protections is allowed * @vma: memory region * @reqprot: application requested protection * @prot: protection applied by the kernel * * Check permissions before changing memory access permissions. * * Return: Returns 0 if permission is granted. */ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return call_int_hook(file_mprotect, vma, reqprot, prot); } /** * security_file_lock() - Check if a file lock is allowed * @file: file * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK) * * Check permission before performing file locking operations. Note the hook * mediates both flock and fcntl style locks. * * Return: Returns 0 if permission is granted. */ int security_file_lock(struct file *file, unsigned int cmd) { return call_int_hook(file_lock, file, cmd); } /** * security_file_fcntl() - Check if fcntl() op is allowed * @file: file * @cmd: fcntl command * @arg: command argument * * Check permission before allowing the file operation specified by @cmd from * being performed on the file @file. Note that @arg sometimes represents a * user space pointer; in other cases, it may be a simple integer value. When * @arg represents a user space pointer, it should never be used by the * security module. * * Return: Returns 0 if permission is granted. */ int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_fcntl, file, cmd, arg); } /** * security_file_set_fowner() - Set the file owner info in the LSM blob * @file: the file * * Save owner security information (typically from current->security) in * file->f_security for later use by the send_sigiotask hook. * * This hook is called with file->f_owner.lock held. * * Return: Returns 0 on success. */ void security_file_set_fowner(struct file *file) { call_void_hook(file_set_fowner, file); } /** * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed * @tsk: target task * @fown: signal sender * @sig: signal to be sent, SIGIO is sent if 0 * * Check permission for the file owner @fown to send SIGIO or SIGURG to the * process @tsk. Note that this hook is sometimes called from interrupt. Note * that the fown_struct, @fown, is never outside the context of a struct file, * so the file structure (and associated security information) can always be * obtained: container_of(fown, struct file, f_owner). * * Return: Returns 0 if permission is granted. */ int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig) { return call_int_hook(file_send_sigiotask, tsk, fown, sig); } /** * security_file_receive() - Check if receiving a file via IPC is allowed * @file: file being received * * This hook allows security modules to control the ability of a process to * receive an open file descriptor via socket IPC. * * Return: Returns 0 if permission is granted. */ int security_file_receive(struct file *file) { return call_int_hook(file_receive, file); } /** * security_file_open() - Save open() time state for late use by the LSM * @file: * * Save open-time permission checking state for later use upon file_permission, * and recheck access if anything has changed since inode_permission. * * Return: Returns 0 if permission is granted. */ int security_file_open(struct file *file) { int ret; ret = call_int_hook(file_open, file); if (ret) return ret; return fsnotify_open_perm(file); } /** * security_file_post_open() - Evaluate a file after it has been opened * @file: the file * @mask: access mask * * Evaluate an opened file and the access mask requested with open(). The hook * is useful for LSMs that require the file content to be available in order to * make decisions. * * Return: Returns 0 if permission is granted. */ int security_file_post_open(struct file *file, int mask) { return call_int_hook(file_post_open, file, mask); } EXPORT_SYMBOL_GPL(security_file_post_open); /** * security_file_truncate() - Check if truncating a file is allowed * @file: file * * Check permission before truncating a file, i.e. using ftruncate. Note that * truncation permission may also be checked based on the path, using the * @path_truncate hook. * * Return: Returns 0 if permission is granted. */ int security_file_truncate(struct file *file) { return call_int_hook(file_truncate, file); } /** * security_task_alloc() - Allocate a task's LSM blob * @task: the task * @clone_flags: flags indicating what is being shared * * Handle allocation of task-related resources. * * Return: Returns a zero on success, negative values on failure. */ int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { int rc = lsm_task_alloc(task); if (rc) return rc; rc = call_int_hook(task_alloc, task, clone_flags); if (unlikely(rc)) security_task_free(task); return rc; } /** * security_task_free() - Free a task's LSM blob and related resources * @task: task * * Handle release of task-related resources. Note that this can be called from * interrupt context. */ void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); kfree(task->security); task->security = NULL; } /** * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer * @cred: credentials * @gfp: gfp flags * * Only allocate sufficient memory and attach to @cred such that * cred_transfer() will not get ENOMEM. * * Return: Returns 0 on success, negative values on failure. */ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { int rc = lsm_cred_alloc(cred, gfp); if (rc) return rc; rc = call_int_hook(cred_alloc_blank, cred, gfp); if (unlikely(rc)) security_cred_free(cred); return rc; } /** * security_cred_free() - Free the cred's LSM blob and associated resources * @cred: credentials * * Deallocate and clear the cred->security field in a set of credentials. */ void security_cred_free(struct cred *cred) { /* * There is a failure case in prepare_creds() that * may result in a call here with ->security being NULL. */ if (unlikely(cred->security == NULL)) return; call_void_hook(cred_free, cred); kfree(cred->security); cred->security = NULL; } /** * security_prepare_creds() - Prepare a new set of credentials * @new: new credentials * @old: original credentials * @gfp: gfp flags * * Prepare a new set of credentials by copying the data from the old set. * * Return: Returns 0 on success, negative values on failure. */ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { int rc = lsm_cred_alloc(new, gfp); if (rc) return rc; rc = call_int_hook(cred_prepare, new, old, gfp); if (unlikely(rc)) security_cred_free(new); return rc; } /** * security_transfer_creds() - Transfer creds * @new: target credentials * @old: original credentials * * Transfer data from original creds to new creds. */ void security_transfer_creds(struct cred *new, const struct cred *old) { call_void_hook(cred_transfer, new, old); } /** * security_cred_getsecid() - Get the secid from a set of credentials * @c: credentials * @secid: secid value * * Retrieve the security identifier of the cred structure @c. In case of * failure, @secid will be set to zero. */ void security_cred_getsecid(const struct cred *c, u32 *secid) { *secid = 0; call_void_hook(cred_getsecid, c, secid); } EXPORT_SYMBOL(security_cred_getsecid); /** * security_kernel_act_as() - Set the kernel credentials to act as secid * @new: credentials * @secid: secid * * Set the credentials for a kernel service to act as (subjective context). * The current task must be the one that nominated @secid. * * Return: Returns 0 if successful. */ int security_kernel_act_as(struct cred *new, u32 secid) { return call_int_hook(kernel_act_as, new, secid); } /** * security_kernel_create_files_as() - Set file creation context using an inode * @new: target credentials * @inode: reference inode * * Set the file creation context in a set of credentials to be the same as the * objective context of the specified inode. The current task must be the one * that nominated @inode. * * Return: Returns 0 if successful. */ int security_kernel_create_files_as(struct cred *new, struct inode *inode) { return call_int_hook(kernel_create_files_as, new, inode); } /** * security_kernel_module_request() - Check if loading a module is allowed * @kmod_name: module name * * Ability to trigger the kernel to automatically upcall to userspace for * userspace to load a kernel module with the given name. * * Return: Returns 0 if successful. */ int security_kernel_module_request(char *kmod_name) { return call_int_hook(kernel_module_request, kmod_name); } /** * security_kernel_read_file() - Read a file specified by userspace * @file: file * @id: file identifier * @contents: trust if security_kernel_post_read_file() will be called * * Read a file specified by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { return call_int_hook(kernel_read_file, file, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_read_file); /** * security_kernel_post_read_file() - Read a file specified by userspace * @file: file * @buf: file contents * @size: size of file contents * @id: file identifier * * Read a file specified by userspace. This must be paired with a prior call * to security_kernel_read_file() call that indicated this hook would also be * called, see security_kernel_read_file() for more information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { return call_int_hook(kernel_post_read_file, file, buf, size, id); } EXPORT_SYMBOL_GPL(security_kernel_post_read_file); /** * security_kernel_load_data() - Load data provided by userspace * @id: data identifier * @contents: true if security_kernel_post_load_data() will be called * * Load data provided by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_load_data(enum kernel_load_data_id id, bool contents) { return call_int_hook(kernel_load_data, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_load_data); /** * security_kernel_post_load_data() - Load userspace data from a non-file source * @buf: data * @size: size of data * @id: data identifier * @description: text description of data, specific to the id value * * Load data provided by a non-file source (usually userspace buffer). This * must be paired with a prior security_kernel_load_data() call that indicated * this hook would also be called, see security_kernel_load_data() for more * information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description) { return call_int_hook(kernel_post_load_data, buf, size, id, description); } EXPORT_SYMBOL_GPL(security_kernel_post_load_data); /** * security_task_fix_setuid() - Update LSM with new user id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag values * * Update the module's state after setting one or more of the user identity * attributes of the current process. The @flags parameter indicates which of * the set*uid system calls invoked this hook. If @new is the set of * credentials that will be installed. Modifications should be made to this * rather than to @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setuid, new, old, flags); } /** * security_task_fix_setgid() - Update LSM with new group id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag value * * Update the module's state after setting one or more of the group identity * attributes of the current process. The @flags parameter indicates which of * the set*gid system calls invoked this hook. @new is the set of credentials * that will be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setgid, new, old, flags); } /** * security_task_fix_setgroups() - Update LSM with new supplementary groups * @new: updated credentials * @old: credentials being replaced * * Update the module's state after setting the supplementary group identity * attributes of the current process. @new is the set of credentials that will * be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgroups(struct cred *new, const struct cred *old) { return call_int_hook(task_fix_setgroups, new, old); } /** * security_task_setpgid() - Check if setting the pgid is allowed * @p: task being modified * @pgid: new pgid * * Check permission before setting the process group identifier of the process * @p to @pgid. * * Return: Returns 0 if permission is granted. */ int security_task_setpgid(struct task_struct *p, pid_t pgid) { return call_int_hook(task_setpgid, p, pgid); } /** * security_task_getpgid() - Check if getting the pgid is allowed * @p: task * * Check permission before getting the process group identifier of the process * @p. * * Return: Returns 0 if permission is granted. */ int security_task_getpgid(struct task_struct *p) { return call_int_hook(task_getpgid, p); } /** * security_task_getsid() - Check if getting the session id is allowed * @p: task * * Check permission before getting the session identifier of the process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getsid(struct task_struct *p) { return call_int_hook(task_getsid, p); } /** * security_current_getsecid_subj() - Get the current task's subjective secid * @secid: secid value * * Retrieve the subjective security identifier of the current task and return * it in @secid. In case of failure, @secid will be set to zero. */ void security_current_getsecid_subj(u32 *secid) { *secid = 0; call_void_hook(current_getsecid_subj, secid); } EXPORT_SYMBOL(security_current_getsecid_subj); /** * security_task_getsecid_obj() - Get a task's objective secid * @p: target task * @secid: secid value * * Retrieve the objective security identifier of the task_struct in @p and * return it in @secid. In case of failure, @secid will be set to zero. */ void security_task_getsecid_obj(struct task_struct *p, u32 *secid) { *secid = 0; call_void_hook(task_getsecid_obj, p, secid); } EXPORT_SYMBOL(security_task_getsecid_obj); /** * security_task_setnice() - Check if setting a task's nice value is allowed * @p: target task * @nice: nice value * * Check permission before setting the nice value of @p to @nice. * * Return: Returns 0 if permission is granted. */ int security_task_setnice(struct task_struct *p, int nice) { return call_int_hook(task_setnice, p, nice); } /** * security_task_setioprio() - Check if setting a task's ioprio is allowed * @p: target task * @ioprio: ioprio value * * Check permission before setting the ioprio value of @p to @ioprio. * * Return: Returns 0 if permission is granted. */ int security_task_setioprio(struct task_struct *p, int ioprio) { return call_int_hook(task_setioprio, p, ioprio); } /** * security_task_getioprio() - Check if getting a task's ioprio is allowed * @p: task * * Check permission before getting the ioprio value of @p. * * Return: Returns 0 if permission is granted. */ int security_task_getioprio(struct task_struct *p) { return call_int_hook(task_getioprio, p); } /** * security_task_prlimit() - Check if get/setting resources limits is allowed * @cred: current task credentials * @tcred: target task credentials * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both * * Check permission before getting and/or setting the resource limits of * another task. * * Return: Returns 0 if permission is granted. */ int security_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { return call_int_hook(task_prlimit, cred, tcred, flags); } /** * security_task_setrlimit() - Check if setting a new rlimit value is allowed * @p: target task's group leader * @resource: resource whose limit is being set * @new_rlim: new resource limit * * Check permission before setting the resource limits of process @p for * @resource to @new_rlim. The old resource limit values can be examined by * dereferencing (p->signal->rlim + resource). * * Return: Returns 0 if permission is granted. */ int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { return call_int_hook(task_setrlimit, p, resource, new_rlim); } /** * security_task_setscheduler() - Check if setting sched policy/param is allowed * @p: target task * * Check permission before setting scheduling policy and/or parameters of * process @p. * * Return: Returns 0 if permission is granted. */ int security_task_setscheduler(struct task_struct *p) { return call_int_hook(task_setscheduler, p); } /** * security_task_getscheduler() - Check if getting scheduling info is allowed * @p: target task * * Check permission before obtaining scheduling information for process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getscheduler(struct task_struct *p) { return call_int_hook(task_getscheduler, p); } /** * security_task_movememory() - Check if moving memory is allowed * @p: task * * Check permission before moving memory owned by process @p. * * Return: Returns 0 if permission is granted. */ int security_task_movememory(struct task_struct *p) { return call_int_hook(task_movememory, p); } /** * security_task_kill() - Check if sending a signal is allowed * @p: target process * @info: signal information * @sig: signal value * @cred: credentials of the signal sender, NULL if @current * * Check permission before sending signal @sig to @p. @info can be NULL, the * constant 1, or a pointer to a kernel_siginfo structure. If @info is 1 or * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from * the kernel and should typically be permitted. SIGIO signals are handled * separately by the send_sigiotask hook in file_security_ops. * * Return: Returns 0 if permission is granted. */ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { return call_int_hook(task_kill, p, info, sig, cred); } /** * security_task_prctl() - Check if a prctl op is allowed * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Check permission before performing a process control operation on the * current process. * * Return: Return -ENOSYS if no-one wanted to handle this op, any other value * to cause prctl() to return immediately with that value. */ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int thisrc; int rc = LSM_RET_DEFAULT(task_prctl); struct lsm_static_call *scall; lsm_for_each_hook(scall, task_prctl) { thisrc = scall->hl->hook.task_prctl(option, arg2, arg3, arg4, arg5); if (thisrc != LSM_RET_DEFAULT(task_prctl)) { rc = thisrc; if (thisrc != 0) break; } } return rc; } /** * security_task_to_inode() - Set the security attributes of a task's inode * @p: task * @inode: inode * * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. */ void security_task_to_inode(struct task_struct *p, struct inode *inode) { call_void_hook(task_to_inode, p, inode); } /** * security_create_user_ns() - Check if creating a new userns is allowed * @cred: prepared creds * * Check permission prior to creating a new user namespace. * * Return: Returns 0 if successful, otherwise < 0 error code. */ int security_create_user_ns(const struct cred *cred) { return call_int_hook(userns_create, cred); } /** * security_ipc_permission() - Check if sysv ipc access is allowed * @ipcp: ipc permission structure * @flag: requested permissions * * Check permissions for access to IPC. * * Return: Returns 0 if permission is granted. */ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, ipcp, flag); } /** * security_ipc_getsecid() - Get the sysv ipc object's secid * @ipcp: ipc permission structure * @secid: secid pointer * * Get the secid associated with the ipc object. In case of failure, @secid * will be set to zero. */ void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) { *secid = 0; call_void_hook(ipc_getsecid, ipcp, secid); } /** * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob * @msg: message structure * * Allocate and attach a security structure to the msg->security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if operation was successful and permission is granted. */ int security_msg_msg_alloc(struct msg_msg *msg) { int rc = lsm_msg_msg_alloc(msg); if (unlikely(rc)) return rc; rc = call_int_hook(msg_msg_alloc_security, msg); if (unlikely(rc)) security_msg_msg_free(msg); return rc; } /** * security_msg_msg_free() - Free a sysv ipc message LSM blob * @msg: message structure * * Deallocate the security structure for this message. */ void security_msg_msg_free(struct msg_msg *msg) { call_void_hook(msg_msg_free_security, msg); kfree(msg->security); msg->security = NULL; } /** * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Allocate and attach a security structure to @msg. The security field is * initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_msg_queue_alloc(struct kern_ipc_perm *msq) { int rc = lsm_ipc_alloc(msq); if (unlikely(rc)) return rc; rc = call_int_hook(msg_queue_alloc_security, msq); if (unlikely(rc)) security_msg_queue_free(msq); return rc; } /** * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Deallocate security field @perm->security for the message queue. */ void security_msg_queue_free(struct kern_ipc_perm *msq) { call_void_hook(msg_queue_free_security, msq); kfree(msq->security); msq->security = NULL; } /** * security_msg_queue_associate() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @msqflg: operation flags * * Check permission when a message queue is requested through the msgget system * call. This hook is only called when returning the message queue identifier * for an existing message queue, not when a new message queue is created. * * Return: Return 0 if permission is granted. */ int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { return call_int_hook(msg_queue_associate, msq, msqflg); } /** * security_msg_queue_msgctl() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @cmd: operation * * Check permission when a message control operation specified by @cmd is to be * performed on the message queue with permissions. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { return call_int_hook(msg_queue_msgctl, msq, cmd); } /** * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed * @msq: sysv ipc permission structure * @msg: message * @msqflg: operation flags * * Check permission before a message, @msg, is enqueued on the message queue * with permissions specified in @msq. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { return call_int_hook(msg_queue_msgsnd, msq, msg, msqflg); } /** * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed * @msq: sysv ipc permission structure * @msg: message * @target: target task * @type: type of message requested * @mode: operation flags * * Check permission before a message, @msg, is removed from the message queue. * The @target task structure contains a pointer to the process that will be * receiving the message (not equal to the current process when inline receives * are being performed). * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { return call_int_hook(msg_queue_msgrcv, msq, msg, target, type, mode); } /** * security_shm_alloc() - Allocate a sysv shm LSM blob * @shp: sysv ipc permission structure * * Allocate and attach a security structure to the @shp security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_shm_alloc(struct kern_ipc_perm *shp) { int rc = lsm_ipc_alloc(shp); if (unlikely(rc)) return rc; rc = call_int_hook(shm_alloc_security, shp); if (unlikely(rc)) security_shm_free(shp); return rc; } /** * security_shm_free() - Free a sysv shm LSM blob * @shp: sysv ipc permission structure * * Deallocate the security structure @perm->security for the memory segment. */ void security_shm_free(struct kern_ipc_perm *shp) { call_void_hook(shm_free_security, shp); kfree(shp->security); shp->security = NULL; } /** * security_shm_associate() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @shmflg: operation flags * * Check permission when a shared memory region is requested through the shmget * system call. This hook is only called when returning the shared memory * region identifier for an existing region, not when a new shared memory * region is created. * * Return: Returns 0 if permission is granted. */ int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) { return call_int_hook(shm_associate, shp, shmflg); } /** * security_shm_shmctl() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @cmd: operation * * Check permission when a shared memory control operation specified by @cmd is * to be performed on the shared memory region with permissions in @shp. * * Return: Return 0 if permission is granted. */ int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { return call_int_hook(shm_shmctl, shp, cmd); } /** * security_shm_shmat() - Check if a sysv shm attach operation is allowed * @shp: sysv ipc permission structure * @shmaddr: address of memory region to attach * @shmflg: operation flags * * Check permissions prior to allowing the shmat system call to attach the * shared memory segment with permissions @shp to the data segment of the * calling process. The attaching address is specified by @shmaddr. * * Return: Returns 0 if permission is granted. */ int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { return call_int_hook(shm_shmat, shp, shmaddr, shmflg); } /** * security_sem_alloc() - Allocate a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Allocate and attach a security structure to the @sma security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_sem_alloc(struct kern_ipc_perm *sma) { int rc = lsm_ipc_alloc(sma); if (unlikely(rc)) return rc; rc = call_int_hook(sem_alloc_security, sma); if (unlikely(rc)) security_sem_free(sma); return rc; } /** * security_sem_free() - Free a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Deallocate security structure @sma->security for the semaphore. */ void security_sem_free(struct kern_ipc_perm *sma) { call_void_hook(sem_free_security, sma); kfree(sma->security); sma->security = NULL; } /** * security_sem_associate() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @semflg: operation flags * * Check permission when a semaphore is requested through the semget system * call. This hook is only called when returning the semaphore identifier for * an existing semaphore, not when a new one must be created. * * Return: Returns 0 if permission is granted. */ int security_sem_associate(struct kern_ipc_perm *sma, int semflg) { return call_int_hook(sem_associate, sma, semflg); } /** * security_sem_semctl() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @cmd: operation * * Check permission when a semaphore operation specified by @cmd is to be * performed on the semaphore. * * Return: Returns 0 if permission is granted. */ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd) { return call_int_hook(sem_semctl, sma, cmd); } /** * security_sem_semop() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @sops: operations to perform * @nsops: number of operations * @alter: flag indicating changes will be made * * Check permissions before performing operations on members of the semaphore * set. If the @alter flag is nonzero, the semaphore set may be modified. * * Return: Returns 0 if permission is granted. */ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { return call_int_hook(sem_semop, sma, sops, nsops, alter); } /** * security_d_instantiate() - Populate an inode's LSM state based on a dentry * @dentry: dentry * @inode: inode * * Fill in @inode security information for a @dentry if allowed. */ void security_d_instantiate(struct dentry *dentry, struct inode *inode) { if (unlikely(inode && IS_PRIVATE(inode))) return; call_void_hook(d_instantiate, dentry, inode); } EXPORT_SYMBOL(security_d_instantiate); /* * Please keep this in sync with it's counterpart in security/lsm_syscalls.c */ /** * security_getselfattr - Read an LSM attribute of the current process. * @attr: which attribute to return * @uctx: the user-space destination for the information, or NULL * @size: pointer to the size of space available to receive the data * @flags: special handling options. LSM_FLAG_SINGLE indicates that only * attributes associated with the LSM identified in the passed @ctx be * reported. * * A NULL value for @uctx can be used to get both the number of attributes * and the size of the data. * * Returns the number of attributes found on success, negative value * on error. @size is reset to the total size of the data. * If @size is insufficient to contain the data -E2BIG is returned. */ int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, u32 __user *size, u32 flags) { struct lsm_static_call *scall; struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, }; u8 __user *base = (u8 __user *)uctx; u32 entrysize; u32 total = 0; u32 left; bool toobig = false; bool single = false; int count = 0; int rc; if (attr == LSM_ATTR_UNDEF) return -EINVAL; if (size == NULL) return -EINVAL; if (get_user(left, size)) return -EFAULT; if (flags) { /* * Only flag supported is LSM_FLAG_SINGLE */ if (flags != LSM_FLAG_SINGLE || !uctx) return -EINVAL; if (copy_from_user(&lctx, uctx, sizeof(lctx))) return -EFAULT; /* * If the LSM ID isn't specified it is an error. */ if (lctx.id == LSM_ID_UNDEF) return -EINVAL; single = true; } /* * In the usual case gather all the data from the LSMs. * In the single case only get the data from the LSM specified. */ lsm_for_each_hook(scall, getselfattr) { if (single && lctx.id != scall->hl->lsmid->id) continue; entrysize = left; if (base) uctx = (struct lsm_ctx __user *)(base + total); rc = scall->hl->hook.getselfattr(attr, uctx, &entrysize, flags); if (rc == -EOPNOTSUPP) { rc = 0; continue; } if (rc == -E2BIG) { rc = 0; left = 0; toobig = true; } else if (rc < 0) return rc; else left -= entrysize; total += entrysize; count += rc; if (single) break; } if (put_user(total, size)) return -EFAULT; if (toobig) return -E2BIG; if (count == 0) return LSM_RET_DEFAULT(getselfattr); return count; } /* * Please keep this in sync with it's counterpart in security/lsm_syscalls.c */ /** * security_setselfattr - Set an LSM attribute on the current process. * @attr: which attribute to set * @uctx: the user-space source for the information * @size: the size of the data * @flags: reserved for future use, must be 0 * * Set an LSM attribute for the current process. The LSM, attribute * and new value are included in @uctx. * * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT * if the user buffer is inaccessible, E2BIG if size is too big, or an * LSM specific failure. */ int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx, u32 size, u32 flags) { struct lsm_static_call *scall; struct lsm_ctx *lctx; int rc = LSM_RET_DEFAULT(setselfattr); u64 required_len; if (flags) return -EINVAL; if (size < sizeof(*lctx)) return -EINVAL; if (size > PAGE_SIZE) return -E2BIG; lctx = memdup_user(uctx, size); if (IS_ERR(lctx)) return PTR_ERR(lctx); if (size < lctx->len || check_add_overflow(sizeof(*lctx), lctx->ctx_len, &required_len) || lctx->len < required_len) { rc = -EINVAL; goto free_out; } lsm_for_each_hook(scall, setselfattr) if ((scall->hl->lsmid->id) == lctx->id) { rc = scall->hl->hook.setselfattr(attr, lctx, size, flags); break; } free_out: kfree(lctx); return rc; } /** * security_getprocattr() - Read an attribute for a task * @p: the task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * * Read attribute @name for task @p and store it into @value if allowed. * * Return: Returns the length of @value on success, a negative value otherwise. */ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { struct lsm_static_call *scall; lsm_for_each_hook(scall, getprocattr) { if (lsmid != 0 && lsmid != scall->hl->lsmid->id) continue; return scall->hl->hook.getprocattr(p, name, value); } return LSM_RET_DEFAULT(getprocattr); } /** * security_setprocattr() - Set an attribute for a task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * @size: attribute value size * * Write (set) the current task's attribute @name to @value, size @size if * allowed. * * Return: Returns bytes written on success, a negative value otherwise. */ int security_setprocattr(int lsmid, const char *name, void *value, size_t size) { struct lsm_static_call *scall; lsm_for_each_hook(scall, setprocattr) { if (lsmid != 0 && lsmid != scall->hl->lsmid->id) continue; return scall->hl->hook.setprocattr(name, value, size); } return LSM_RET_DEFAULT(setprocattr); } /** * security_netlink_send() - Save info and check if netlink sending is allowed * @sk: sending socket * @skb: netlink message * * Save security information for a netlink message so that permission checking * can be performed when the message is processed. The security information * can be saved using the eff_cap field of the netlink_skb_parms structure. * Also may be used to provide fine grained control over message transmission. * * Return: Returns 0 if the information was successfully saved and message is * allowed to be transmitted. */ int security_netlink_send(struct sock *sk, struct sk_buff *skb) { return call_int_hook(netlink_send, sk, skb); } /** * security_ismaclabel() - Check if the named attribute is a MAC label * @name: full extended attribute name * * Check if the extended attribute specified by @name represents a MAC label. * * Return: Returns 1 if name is a MAC attribute otherwise returns 0. */ int security_ismaclabel(const char *name) { return call_int_hook(ismaclabel, name); } EXPORT_SYMBOL(security_ismaclabel); /** * security_secid_to_secctx() - Convert a secid to a secctx * @secid: secid * @secdata: secctx * @seclen: secctx length * * Convert secid to security context. If @secdata is NULL the length of the * result will be returned in @seclen, but no @secdata will be returned. This * does mean that the length could change between calls to check the length and * the next call which actually allocates and returns the @secdata. * * Return: Return 0 on success, error on failure. */ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return call_int_hook(secid_to_secctx, secid, secdata, seclen); } EXPORT_SYMBOL(security_secid_to_secctx); /** * security_secctx_to_secid() - Convert a secctx to a secid * @secdata: secctx * @seclen: length of secctx * @secid: secid * * Convert security context to secid. * * Return: Returns 0 on success, error on failure. */ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { *secid = 0; return call_int_hook(secctx_to_secid, secdata, seclen, secid); } EXPORT_SYMBOL(security_secctx_to_secid); /** * security_release_secctx() - Free a secctx buffer * @secdata: secctx * @seclen: length of secctx * * Release the security context. */ void security_release_secctx(char *secdata, u32 seclen) { call_void_hook(release_secctx, secdata, seclen); } EXPORT_SYMBOL(security_release_secctx); /** * security_inode_invalidate_secctx() - Invalidate an inode's security label * @inode: inode * * Notify the security module that it must revalidate the security context of * an inode. */ void security_inode_invalidate_secctx(struct inode *inode) { call_void_hook(inode_invalidate_secctx, inode); } EXPORT_SYMBOL(security_inode_invalidate_secctx); /** * security_inode_notifysecctx() - Notify the LSM of an inode's security label * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * Notify the security module of what the security context of an inode should * be. Initializes the incore security context managed by the security module * for this inode. Example usage: NFS client invokes this hook to initialize * the security context in its incore inode to the value provided by the server * for the file when the server returned the file's attributes to the client. * Must be called with inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return call_int_hook(inode_notifysecctx, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_notifysecctx); /** * security_inode_setsecctx() - Change the security label of an inode * @dentry: inode * @ctx: secctx * @ctxlen: length of secctx * * Change the security context of an inode. Updates the incore security * context managed by the security module and invokes the fs code as needed * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the * context. Example usage: NFS server invokes this hook to change the security * context in its incore inode and on the backing filesystem to a value * provided by the client on a SETATTR operation. Must be called with * inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return call_int_hook(inode_setsecctx, dentry, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_setsecctx); /** * security_inode_getsecctx() - Get the security label of an inode * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * On success, returns 0 and fills out @ctx and @ctxlen with the security * context for the given @inode. * * Return: Returns 0 on success, error on failure. */ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { return call_int_hook(inode_getsecctx, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_WATCH_QUEUE /** * security_post_notification() - Check if a watch notification can be posted * @w_cred: credentials of the task that set the watch * @cred: credentials of the task which triggered the watch * @n: the notification * * Check to see if a watch notification can be posted to a particular queue. * * Return: Returns 0 if permission is granted. */ int security_post_notification(const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) { return call_int_hook(post_notification, w_cred, cred, n); } #endif /* CONFIG_WATCH_QUEUE */ #ifdef CONFIG_KEY_NOTIFICATIONS /** * security_watch_key() - Check if a task is allowed to watch for key events * @key: the key to watch * * Check to see if a process is allowed to watch for event notifications from * a key or keyring. * * Return: Returns 0 if permission is granted. */ int security_watch_key(struct key *key) { return call_int_hook(watch_key, key); } #endif /* CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK /** * security_unix_stream_connect() - Check if a AF_UNIX stream is allowed * @sock: originating sock * @other: peer sock * @newsk: new sock * * Check permissions before establishing a Unix domain stream connection * between @sock and @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return call_int_hook(unix_stream_connect, sock, other, newsk); } EXPORT_SYMBOL(security_unix_stream_connect); /** * security_unix_may_send() - Check if AF_UNIX socket can send datagrams * @sock: originating sock * @other: peer sock * * Check permissions before connecting or sending datagrams from @sock to * @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_may_send(struct socket *sock, struct socket *other) { return call_int_hook(unix_may_send, sock, other); } EXPORT_SYMBOL(security_unix_may_send); /** * security_socket_create() - Check if creating a new socket is allowed * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * Check permissions prior to creating a new socket. * * Return: Returns 0 if permission is granted. */ int security_socket_create(int family, int type, int protocol, int kern) { return call_int_hook(socket_create, family, type, protocol, kern); } /** * security_socket_post_create() - Initialize a newly created socket * @sock: socket * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * This hook allows a module to update or allocate a per-socket security * structure. Note that the security field was not added directly to the socket * structure, but rather, the socket security information is stored in the * associated inode. Typically, the inode alloc_security hook will allocate * and attach security information to SOCK_INODE(sock)->i_security. This hook * may be used to update the SOCK_INODE(sock)->i_security field with additional * information that wasn't available when the inode was allocated. * * Return: Returns 0 if permission is granted. */ int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { return call_int_hook(socket_post_create, sock, family, type, protocol, kern); } /** * security_socket_socketpair() - Check if creating a socketpair is allowed * @socka: first socket * @sockb: second socket * * Check permissions before creating a fresh pair of sockets. * * Return: Returns 0 if permission is granted and the connection was * established. */ int security_socket_socketpair(struct socket *socka, struct socket *sockb) { return call_int_hook(socket_socketpair, socka, sockb); } EXPORT_SYMBOL(security_socket_socketpair); /** * security_socket_bind() - Check if a socket bind operation is allowed * @sock: socket * @address: requested bind address * @addrlen: length of address * * Check permission before socket protocol layer bind operation is performed * and the socket @sock is bound to the address specified in the @address * parameter. * * Return: Returns 0 if permission is granted. */ int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_bind, sock, address, addrlen); } /** * security_socket_connect() - Check if a socket connect operation is allowed * @sock: socket * @address: address of remote connection point * @addrlen: length of address * * Check permission before socket protocol layer connect operation attempts to * connect socket @sock to a remote address, @address. * * Return: Returns 0 if permission is granted. */ int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_connect, sock, address, addrlen); } /** * security_socket_listen() - Check if a socket is allowed to listen * @sock: socket * @backlog: connection queue size * * Check permission before socket protocol layer listen operation. * * Return: Returns 0 if permission is granted. */ int security_socket_listen(struct socket *sock, int backlog) { return call_int_hook(socket_listen, sock, backlog); } /** * security_socket_accept() - Check if a socket is allowed to accept connections * @sock: listening socket * @newsock: newly creation connection socket * * Check permission before accepting a new connection. Note that the new * socket, @newsock, has been created and some information copied to it, but * the accept operation has not actually been performed. * * Return: Returns 0 if permission is granted. */ int security_socket_accept(struct socket *sock, struct socket *newsock) { return call_int_hook(socket_accept, sock, newsock); } /** * security_socket_sendmsg() - Check if sending a message is allowed * @sock: sending socket * @msg: message to send * @size: size of message * * Check permission before transmitting a message to another socket. * * Return: Returns 0 if permission is granted. */ int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return call_int_hook(socket_sendmsg, sock, msg, size); } /** * security_socket_recvmsg() - Check if receiving a message is allowed * @sock: receiving socket * @msg: message to receive * @size: size of message * @flags: operational flags * * Check permission before receiving a message from a socket. * * Return: Returns 0 if permission is granted. */ int security_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return call_int_hook(socket_recvmsg, sock, msg, size, flags); } /** * security_socket_getsockname() - Check if reading the socket addr is allowed * @sock: socket * * Check permission before reading the local address (name) of the socket * object. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockname(struct socket *sock) { return call_int_hook(socket_getsockname, sock); } /** * security_socket_getpeername() - Check if reading the peer's addr is allowed * @sock: socket * * Check permission before the remote address (name) of a socket object. * * Return: Returns 0 if permission is granted. */ int security_socket_getpeername(struct socket *sock) { return call_int_hook(socket_getpeername, sock); } /** * security_socket_getsockopt() - Check if reading a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before retrieving the options associated with socket * @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_getsockopt, sock, level, optname); } /** * security_socket_setsockopt() - Check if setting a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before setting the options associated with socket @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_setsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_setsockopt, sock, level, optname); } /** * security_socket_shutdown() - Checks if shutting down the socket is allowed * @sock: socket * @how: flag indicating how sends and receives are handled * * Checks permission before all or part of a connection on the socket @sock is * shut down. * * Return: Returns 0 if permission is granted. */ int security_socket_shutdown(struct socket *sock, int how) { return call_int_hook(socket_shutdown, sock, how); } /** * security_sock_rcv_skb() - Check if an incoming network packet is allowed * @sk: destination sock * @skb: incoming packet * * Check permissions on incoming network packets. This hook is distinct from * Netfilter's IP input hooks since it is the first time that the incoming * sk_buff @skb has been associated with a particular socket, @sk. Must not * sleep inside this hook because some callers hold spinlocks. * * Return: Returns 0 if permission is granted. */ int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { return call_int_hook(socket_sock_rcv_skb, sk, skb); } EXPORT_SYMBOL(security_sock_rcv_skb); /** * security_socket_getpeersec_stream() - Get the remote peer label * @sock: socket * @optval: destination buffer * @optlen: size of peer label copied into the buffer * @len: maximum size of the destination buffer * * This hook allows the security module to provide peer socket security state * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC. * For tcp sockets this can be meaningful if the socket is associated with an * ipsec SA. * * Return: Returns 0 if all is well, otherwise, typical getsockopt return * values. */ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { return call_int_hook(socket_getpeersec_stream, sock, optval, optlen, len); } /** * security_socket_getpeersec_dgram() - Get the remote peer label * @sock: socket * @skb: datagram packet * @secid: remote peer label secid * * This hook allows the security module to provide peer socket security state * for udp sockets on a per-packet basis to userspace via getsockopt * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC * option via getsockopt. It can then retrieve the security state returned by * this hook for a packet via the SCM_SECURITY ancillary message type. * * Return: Returns 0 on success, error on failure. */ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { return call_int_hook(socket_getpeersec_dgram, sock, skb, secid); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); /** * lsm_sock_alloc - allocate a composite sock blob * @sock: the sock that needs a blob * @gfp: allocation mode * * Allocate the sock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_sock_alloc(struct sock *sock, gfp_t gfp) { return lsm_blob_alloc(&sock->sk_security, blob_sizes.lbs_sock, gfp); } /** * security_sk_alloc() - Allocate and initialize a sock's LSM blob * @sk: sock * @family: protocol family * @priority: gfp flags * * Allocate and attach a security structure to the sk->sk_security field, which * is used to copy security attributes between local stream sockets. * * Return: Returns 0 on success, error on failure. */ int security_sk_alloc(struct sock *sk, int family, gfp_t priority) { int rc = lsm_sock_alloc(sk, priority); if (unlikely(rc)) return rc; rc = call_int_hook(sk_alloc_security, sk, family, priority); if (unlikely(rc)) security_sk_free(sk); return rc; } /** * security_sk_free() - Free the sock's LSM blob * @sk: sock * * Deallocate security structure. */ void security_sk_free(struct sock *sk) { call_void_hook(sk_free_security, sk); kfree(sk->sk_security); sk->sk_security = NULL; } /** * security_sk_clone() - Clone a sock's LSM state * @sk: original sock * @newsk: target sock * * Clone/copy security structure. */ void security_sk_clone(const struct sock *sk, struct sock *newsk) { call_void_hook(sk_clone_security, sk, newsk); } EXPORT_SYMBOL(security_sk_clone); /** * security_sk_classify_flow() - Set a flow's secid based on socket * @sk: original socket * @flic: target flow * * Set the target flow's secid to socket's secid. */ void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic) { call_void_hook(sk_getsecid, sk, &flic->flowic_secid); } EXPORT_SYMBOL(security_sk_classify_flow); /** * security_req_classify_flow() - Set a flow's secid based on request_sock * @req: request_sock * @flic: target flow * * Sets @flic's secid to @req's secid. */ void security_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { call_void_hook(req_classify_flow, req, flic); } EXPORT_SYMBOL(security_req_classify_flow); /** * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket * @sk: sock being grafted * @parent: target parent socket * * Sets @parent's inode secid to @sk's secid and update @sk with any necessary * LSM state from @parent. */ void security_sock_graft(struct sock *sk, struct socket *parent) { call_void_hook(sock_graft, sk, parent); } EXPORT_SYMBOL(security_sock_graft); /** * security_inet_conn_request() - Set request_sock state using incoming connect * @sk: parent listening sock * @skb: incoming connection * @req: new request_sock * * Initialize the @req LSM state based on @sk and the incoming connect in @skb. * * Return: Returns 0 if permission is granted. */ int security_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { return call_int_hook(inet_conn_request, sk, skb, req); } EXPORT_SYMBOL(security_inet_conn_request); /** * security_inet_csk_clone() - Set new sock LSM state based on request_sock * @newsk: new sock * @req: connection request_sock * * Set that LSM state of @sock using the LSM state from @req. */ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { call_void_hook(inet_csk_clone, newsk, req); } /** * security_inet_conn_established() - Update sock's LSM state with connection * @sk: sock * @skb: connection packet * * Update @sock's LSM state to represent a new connection from @skb. */ void security_inet_conn_established(struct sock *sk, struct sk_buff *skb) { call_void_hook(inet_conn_established, sk, skb); } EXPORT_SYMBOL(security_inet_conn_established); /** * security_secmark_relabel_packet() - Check if setting a secmark is allowed * @secid: new secmark value * * Check if the process should be allowed to relabel packets to @secid. * * Return: Returns 0 if permission is granted. */ int security_secmark_relabel_packet(u32 secid) { return call_int_hook(secmark_relabel_packet, secid); } EXPORT_SYMBOL(security_secmark_relabel_packet); /** * security_secmark_refcount_inc() - Increment the secmark labeling rule count * * Tells the LSM to increment the number of secmark labeling rules loaded. */ void security_secmark_refcount_inc(void) { call_void_hook(secmark_refcount_inc); } EXPORT_SYMBOL(security_secmark_refcount_inc); /** * security_secmark_refcount_dec() - Decrement the secmark labeling rule count * * Tells the LSM to decrement the number of secmark labeling rules loaded. */ void security_secmark_refcount_dec(void) { call_void_hook(secmark_refcount_dec); } EXPORT_SYMBOL(security_secmark_refcount_dec); /** * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device * @security: pointer to the LSM blob * * This hook allows a module to allocate a security structure for a TUN device, * returning the pointer in @security. * * Return: Returns a zero on success, negative values on failure. */ int security_tun_dev_alloc_security(void **security) { int rc; rc = lsm_blob_alloc(security, blob_sizes.lbs_tun_dev, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(tun_dev_alloc_security, *security); if (rc) { kfree(*security); *security = NULL; } return rc; } EXPORT_SYMBOL(security_tun_dev_alloc_security); /** * security_tun_dev_free_security() - Free a TUN device LSM blob * @security: LSM blob * * This hook allows a module to free the security structure for a TUN device. */ void security_tun_dev_free_security(void *security) { kfree(security); } EXPORT_SYMBOL(security_tun_dev_free_security); /** * security_tun_dev_create() - Check if creating a TUN device is allowed * * Check permissions prior to creating a new TUN device. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_create(void) { return call_int_hook(tun_dev_create); } EXPORT_SYMBOL(security_tun_dev_create); /** * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed * @security: TUN device LSM blob * * Check permissions prior to attaching to a TUN device queue. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach_queue(void *security) { return call_int_hook(tun_dev_attach_queue, security); } EXPORT_SYMBOL(security_tun_dev_attach_queue); /** * security_tun_dev_attach() - Update TUN device LSM state on attach * @sk: associated sock * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's sock structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach(struct sock *sk, void *security) { return call_int_hook(tun_dev_attach, sk, security); } EXPORT_SYMBOL(security_tun_dev_attach); /** * security_tun_dev_open() - Update TUN device LSM state on open * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's security structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_open(void *security) { return call_int_hook(tun_dev_open, security); } EXPORT_SYMBOL(security_tun_dev_open); /** * security_sctp_assoc_request() - Update the LSM on a SCTP association req * @asoc: SCTP association * @skb: packet requesting the association * * Passes the @asoc and @chunk->skb of the association INIT packet to the LSM. * * Return: Returns 0 on success, error on failure. */ int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_request, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); /** * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option * @sk: socket * @optname: SCTP option to validate * @address: list of IP addresses to validate * @addrlen: length of the address list * * Validiate permissions required for each address associated with sock @sk. * Depending on @optname, the addresses will be treated as either a connect or * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6). * * Return: Returns 0 on success, error on failure. */ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { return call_int_hook(sctp_bind_connect, sk, optname, address, addrlen); } EXPORT_SYMBOL(security_sctp_bind_connect); /** * security_sctp_sk_clone() - Clone a SCTP sock's LSM state * @asoc: SCTP association * @sk: original sock * @newsk: target sock * * Called whenever a new socket is created by accept(2) (i.e. a TCP style * socket) or when a socket is 'peeled off' e.g userspace calls * sctp_peeloff(3). */ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { call_void_hook(sctp_sk_clone, asoc, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); /** * security_sctp_assoc_established() - Update LSM state when assoc established * @asoc: SCTP association * @skb: packet establishing the association * * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet to the * security module. * * Return: Returns 0 if permission is granted. */ int security_sctp_assoc_established(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_established, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_established); /** * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket * @sk: the owning MPTCP socket * @ssk: the new subflow * * Update the labeling for the given MPTCP subflow, to match the one of the * owning MPTCP socket. This hook has to be called after the socket creation and * initialization via the security_socket_create() and * security_socket_post_create() LSM hooks. * * Return: Returns 0 on success or a negative error code on failure. */ int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk) { return call_int_hook(mptcp_add_subflow, sk, ssk); } #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND /** * security_ib_pkey_access() - Check if access to an IB pkey is allowed * @sec: LSM blob * @subnet_prefix: subnet prefix of the port * @pkey: IB pkey * * Check permission to access a pkey when modifying a QP. * * Return: Returns 0 if permission is granted. */ int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey) { return call_int_hook(ib_pkey_access, sec, subnet_prefix, pkey); } EXPORT_SYMBOL(security_ib_pkey_access); /** * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed * @sec: LSM blob * @dev_name: IB device name * @port_num: port number * * Check permissions to send and receive SMPs on a end port. * * Return: Returns 0 if permission is granted. */ int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num) { return call_int_hook(ib_endport_manage_subnet, sec, dev_name, port_num); } EXPORT_SYMBOL(security_ib_endport_manage_subnet); /** * security_ib_alloc_security() - Allocate an Infiniband LSM blob * @sec: LSM blob * * Allocate a security structure for Infiniband objects. * * Return: Returns 0 on success, non-zero on failure. */ int security_ib_alloc_security(void **sec) { int rc; rc = lsm_blob_alloc(sec, blob_sizes.lbs_ib, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(ib_alloc_security, *sec); if (rc) { kfree(*sec); *sec = NULL; } return rc; } EXPORT_SYMBOL(security_ib_alloc_security); /** * security_ib_free_security() - Free an Infiniband LSM blob * @sec: LSM blob * * Deallocate an Infiniband security structure. */ void security_ib_free_security(void *sec) { kfree(sec); } EXPORT_SYMBOL(security_ib_free_security); #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM /** * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob * @ctxp: xfrm security context being added to the SPD * @sec_ctx: security label provided by userspace * @gfp: gfp flags * * Allocate a security structure to the xp->security field; the security field * is initialized to NULL when the xfrm_policy is allocated. * * Return: Return 0 if operation was successful. */ int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) { return call_int_hook(xfrm_policy_alloc_security, ctxp, sec_ctx, gfp); } EXPORT_SYMBOL(security_xfrm_policy_alloc); /** * security_xfrm_policy_clone() - Clone xfrm policy LSM state * @old_ctx: xfrm security context * @new_ctxp: target xfrm security context * * Allocate a security structure in new_ctxp that contains the information from * the old_ctx structure. * * Return: Return 0 if operation was successful. */ int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp) { return call_int_hook(xfrm_policy_clone_security, old_ctx, new_ctxp); } /** * security_xfrm_policy_free() - Free a xfrm security context * @ctx: xfrm security context * * Free LSM resources associated with @ctx. */ void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx) { call_void_hook(xfrm_policy_free_security, ctx); } EXPORT_SYMBOL(security_xfrm_policy_free); /** * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed * @ctx: xfrm security context * * Authorize deletion of a SPD entry. * * Return: Returns 0 if permission is granted. */ int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { return call_int_hook(xfrm_policy_delete_security, ctx); } /** * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @sec_ctx: security label provided by userspace * * Allocate a security structure to the @x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to @sec_ctx. * * Return: Return 0 if operation was successful. */ int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return call_int_hook(xfrm_state_alloc, x, sec_ctx); } EXPORT_SYMBOL(security_xfrm_state_alloc); /** * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @polsec: associated policy's security context * @secid: secid from the flow * * Allocate a security structure to the x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to secid. * * Return: Returns 0 if operation was successful. */ int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) { return call_int_hook(xfrm_state_alloc_acquire, x, polsec, secid); } /** * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed * @x: xfrm state * * Authorize deletion of x->security. * * Return: Returns 0 if permission is granted. */ int security_xfrm_state_delete(struct xfrm_state *x) { return call_int_hook(xfrm_state_delete_security, x); } EXPORT_SYMBOL(security_xfrm_state_delete); /** * security_xfrm_state_free() - Free a xfrm state * @x: xfrm state * * Deallocate x->security. */ void security_xfrm_state_free(struct xfrm_state *x) { call_void_hook(xfrm_state_free_security, x); } /** * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed * @ctx: target xfrm security context * @fl_secid: flow secid used to authorize access * * Check permission when a flow selects a xfrm_policy for processing XFRMs on a * packet. The hook is called when selecting either a per-socket policy or a * generic xfrm policy. * * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on * other errors. */ int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { return call_int_hook(xfrm_policy_lookup, ctx, fl_secid); } /** * security_xfrm_state_pol_flow_match() - Check for a xfrm match * @x: xfrm state to match * @xp: xfrm policy to check for a match * @flic: flow to check for a match. * * Check @xp and @flic for a match with @x. * * Return: Returns 1 if there is a match. */ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) { struct lsm_static_call *scall; int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); /* * Since this function is expected to return 0 or 1, the judgment * becomes difficult if multiple LSMs supply this call. Fortunately, * we can use the first LSM's judgment because currently only SELinux * supplies this call. * * For speed optimization, we explicitly break the loop rather than * using the macro */ lsm_for_each_hook(scall, xfrm_state_pol_flow_match) { rc = scall->hl->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; } /** * security_xfrm_decode_session() - Determine the xfrm secid for a packet * @skb: xfrm packet * @secid: secid * * Decode the packet in @skb and return the security label in @secid. * * Return: Return 0 if all xfrms used have the same secid. */ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) { return call_int_hook(xfrm_decode_session, skb, secid, 1); } void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { int rc = call_int_hook(xfrm_decode_session, skb, &flic->flowic_secid, 0); BUG_ON(rc); } EXPORT_SYMBOL(security_skb_classify_flow); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS /** * security_key_alloc() - Allocate and initialize a kernel key LSM blob * @key: key * @cred: credentials * @flags: allocation flags * * Permit allocation of a key and assign security data. Note that key does not * have a serial number assigned at this point. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { int rc = lsm_key_alloc(key); if (unlikely(rc)) return rc; rc = call_int_hook(key_alloc, key, cred, flags); if (unlikely(rc)) security_key_free(key); return rc; } /** * security_key_free() - Free a kernel key LSM blob * @key: key * * Notification of destruction; free security data. */ void security_key_free(struct key *key) { kfree(key->security); key->security = NULL; } /** * security_key_permission() - Check if a kernel key operation is allowed * @key_ref: key reference * @cred: credentials of actor requesting access * @need_perm: requested permissions * * See whether a specific operational right is granted to a process on a key. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { return call_int_hook(key_permission, key_ref, cred, need_perm); } /** * security_key_getsecurity() - Get the key's security label * @key: key * @buffer: security label buffer * * Get a textual representation of the security context attached to a key for * the purposes of honouring KEYCTL_GETSECURITY. This function allocates the * storage for the NUL-terminated string and the caller should free it. * * Return: Returns the length of @buffer (including terminating NUL) or -ve if * an error occurs. May also return 0 (and a NULL buffer pointer) if * there is no security label assigned to the key. */ int security_key_getsecurity(struct key *key, char **buffer) { *buffer = NULL; return call_int_hook(key_getsecurity, key, buffer); } /** * security_key_post_create_or_update() - Notification of key create or update * @keyring: keyring to which the key is linked to * @key: created or updated key * @payload: data used to instantiate or update the key * @payload_len: length of payload * @flags: key flags * @create: flag indicating whether the key was created or updated * * Notify the caller of a key creation or update. */ void security_key_post_create_or_update(struct key *keyring, struct key *key, const void *payload, size_t payload_len, unsigned long flags, bool create) { call_void_hook(key_post_create_or_update, keyring, key, payload, payload_len, flags, create); } #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT /** * security_audit_rule_init() - Allocate and init an LSM audit rule struct * @field: audit action * @op: rule operator * @rulestr: rule context * @lsmrule: receive buffer for audit rule struct * @gfp: GFP flag used for kmalloc * * Allocate and initialize an LSM audit rule structure. * * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of * an invalid rule. */ int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) { return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp); } /** * security_audit_rule_known() - Check if an audit rule contains LSM fields * @krule: audit rule * * Specifies whether given @krule contains any fields related to the current * LSM. * * Return: Returns 1 in case of relation found, 0 otherwise. */ int security_audit_rule_known(struct audit_krule *krule) { return call_int_hook(audit_rule_known, krule); } /** * security_audit_rule_free() - Free an LSM audit rule struct * @lsmrule: audit rule struct * * Deallocate the LSM audit rule structure previously allocated by * audit_rule_init(). */ void security_audit_rule_free(void *lsmrule) { call_void_hook(audit_rule_free, lsmrule); } /** * security_audit_rule_match() - Check if a label matches an audit rule * @secid: security label * @field: LSM audit field * @op: matching operator * @lsmrule: audit rule * * Determine if given @secid matches a rule previously approved by * security_audit_rule_known(). * * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on * failure. */ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule) { return call_int_hook(audit_rule_match, secid, field, op, lsmrule); } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL /** * security_bpf() - Check if the bpf syscall operation is allowed * @cmd: command * @attr: bpf attribute * @size: size * * Do a initial check for all bpf syscalls after the attribute is copied into * the kernel. The actual security module can implement their own rules to * check the specific cmd they need. * * Return: Returns 0 if permission is granted. */ int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) { return call_int_hook(bpf, cmd, attr, size); } /** * security_bpf_map() - Check if access to a bpf map is allowed * @map: bpf map * @fmode: mode * * Do a check when the kernel generates and returns a file descriptor for eBPF * maps. * * Return: Returns 0 if permission is granted. */ int security_bpf_map(struct bpf_map *map, fmode_t fmode) { return call_int_hook(bpf_map, map, fmode); } /** * security_bpf_prog() - Check if access to a bpf program is allowed * @prog: bpf program * * Do a check when the kernel generates and returns a file descriptor for eBPF * programs. * * Return: Returns 0 if permission is granted. */ int security_bpf_prog(struct bpf_prog *prog) { return call_int_hook(bpf_prog, prog); } /** * security_bpf_map_create() - Check if BPF map creation is allowed * @map: BPF map object * @attr: BPF syscall attributes used to create BPF map * @token: BPF token used to grant user access * * Do a check when the kernel creates a new BPF map. This is also the * point where LSM blob is allocated for LSMs that need them. * * Return: Returns 0 on success, error on failure. */ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token) { return call_int_hook(bpf_map_create, map, attr, token); } /** * security_bpf_prog_load() - Check if loading of BPF program is allowed * @prog: BPF program object * @attr: BPF syscall attributes used to create BPF program * @token: BPF token used to grant user access to BPF subsystem * * Perform an access control check when the kernel loads a BPF program and * allocates associated BPF program object. This hook is also responsible for * allocating any required LSM state for the BPF program. * * Return: Returns 0 on success, error on failure. */ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) { return call_int_hook(bpf_prog_load, prog, attr, token); } /** * security_bpf_token_create() - Check if creating of BPF token is allowed * @token: BPF token object * @attr: BPF syscall attributes used to create BPF token * @path: path pointing to BPF FS mount point from which BPF token is created * * Do a check when the kernel instantiates a new BPF token object from BPF FS * instance. This is also the point where LSM blob can be allocated for LSMs. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { return call_int_hook(bpf_token_create, token, attr, path); } /** * security_bpf_token_cmd() - Check if BPF token is allowed to delegate * requested BPF syscall command * @token: BPF token object * @cmd: BPF syscall command requested to be delegated by BPF token * * Do a check when the kernel decides whether provided BPF token should allow * delegation of requested BPF syscall command. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { return call_int_hook(bpf_token_cmd, token, cmd); } /** * security_bpf_token_capable() - Check if BPF token is allowed to delegate * requested BPF-related capability * @token: BPF token object * @cap: capabilities requested to be delegated by BPF token * * Do a check when the kernel decides whether provided BPF token should allow * delegation of requested BPF-related capabilities. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_capable(const struct bpf_token *token, int cap) { return call_int_hook(bpf_token_capable, token, cap); } /** * security_bpf_map_free() - Free a bpf map's LSM blob * @map: bpf map * * Clean up the security information stored inside bpf map. */ void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free, map); } /** * security_bpf_prog_free() - Free a BPF program's LSM blob * @prog: BPF program struct * * Clean up the security information stored inside BPF program. */ void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); } /** * security_bpf_token_free() - Free a BPF token's LSM blob * @token: BPF token struct * * Clean up the security information stored inside BPF token. */ void security_bpf_token_free(struct bpf_token *token) { call_void_hook(bpf_token_free, token); } #endif /* CONFIG_BPF_SYSCALL */ /** * security_locked_down() - Check if a kernel feature is allowed * @what: requested kernel feature * * Determine whether a kernel feature that potentially enables arbitrary code * execution in kernel space should be permitted. * * Return: Returns 0 if permission is granted. */ int security_locked_down(enum lockdown_reason what) { return call_int_hook(locked_down, what); } EXPORT_SYMBOL(security_locked_down); /** * security_bdev_alloc() - Allocate a block device LSM blob * @bdev: block device * * Allocate and attach a security structure to @bdev->bd_security. The * security field is initialized to NULL when the bdev structure is * allocated. * * Return: Return 0 if operation was successful. */ int security_bdev_alloc(struct block_device *bdev) { int rc = 0; rc = lsm_bdev_alloc(bdev); if (unlikely(rc)) return rc; rc = call_int_hook(bdev_alloc_security, bdev); if (unlikely(rc)) security_bdev_free(bdev); return rc; } EXPORT_SYMBOL(security_bdev_alloc); /** * security_bdev_free() - Free a block device's LSM blob * @bdev: block device * * Deallocate the bdev security structure and set @bdev->bd_security to NULL. */ void security_bdev_free(struct block_device *bdev) { if (!bdev->bd_security) return; call_void_hook(bdev_free_security, bdev); kfree(bdev->bd_security); bdev->bd_security = NULL; } EXPORT_SYMBOL(security_bdev_free); /** * security_bdev_setintegrity() - Set the device's integrity data * @bdev: block device * @type: type of integrity, e.g. hash digest, signature, etc * @value: the integrity value * @size: size of the integrity value * * Register a verified integrity measurement of a bdev with LSMs. * LSMs should free the previously saved data if @value is NULL. * Please note that the new hook should be invoked every time the security * information is updated to keep these data current. For example, in dm-verity, * if the mapping table is reloaded and configured to use a different dm-verity * target with a new roothash and signing information, the previously stored * data in the LSM blob will become obsolete. It is crucial to re-invoke the * hook to refresh these data and ensure they are up to date. This necessity * arises from the design of device-mapper, where a device-mapper device is * first created, and then targets are subsequently loaded into it. These * targets can be modified multiple times during the device's lifetime. * Therefore, while the LSM blob is allocated during the creation of the block * device, its actual contents are not initialized at this stage and can change * substantially over time. This includes alterations from data that the LSMs * 'trusts' to those they do not, making it essential to handle these changes * correctly. Failure to address this dynamic aspect could potentially allow * for bypassing LSM checks. * * Return: Returns 0 on success, negative values on failure. */ int security_bdev_setintegrity(struct block_device *bdev, enum lsm_integrity_type type, const void *value, size_t size) { return call_int_hook(bdev_setintegrity, bdev, type, value, size); } EXPORT_SYMBOL(security_bdev_setintegrity); #ifdef CONFIG_PERF_EVENTS /** * security_perf_event_open() - Check if a perf event open is allowed * @attr: perf event attribute * @type: type of event * * Check whether the @type of perf_event_open syscall is allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_open(struct perf_event_attr *attr, int type) { return call_int_hook(perf_event_open, attr, type); } /** * security_perf_event_alloc() - Allocate a perf event LSM blob * @event: perf event * * Allocate and save perf_event security info. * * Return: Returns 0 on success, error on failure. */ int security_perf_event_alloc(struct perf_event *event) { int rc; rc = lsm_blob_alloc(&event->security, blob_sizes.lbs_perf_event, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(perf_event_alloc, event); if (rc) { kfree(event->security); event->security = NULL; } return rc; } /** * security_perf_event_free() - Free a perf event LSM blob * @event: perf event * * Release (free) perf_event security info. */ void security_perf_event_free(struct perf_event *event) { kfree(event->security); event->security = NULL; } /** * security_perf_event_read() - Check if reading a perf event label is allowed * @event: perf event * * Read perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_read(struct perf_event *event) { return call_int_hook(perf_event_read, event); } /** * security_perf_event_write() - Check if writing a perf event label is allowed * @event: perf event * * Write perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_write(struct perf_event *event) { return call_int_hook(perf_event_write, event); } #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_IO_URING /** * security_uring_override_creds() - Check if overriding creds is allowed * @new: new credentials * * Check if the current task, executing an io_uring operation, is allowed to * override it's credentials with @new. * * Return: Returns 0 if permission is granted. */ int security_uring_override_creds(const struct cred *new) { return call_int_hook(uring_override_creds, new); } /** * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed * * Check whether the current task is allowed to spawn a io_uring polling thread * (IORING_SETUP_SQPOLL). * * Return: Returns 0 if permission is granted. */ int security_uring_sqpoll(void) { return call_int_hook(uring_sqpoll); } /** * security_uring_cmd() - Check if a io_uring passthrough command is allowed * @ioucmd: command * * Check whether the file_operations uring_cmd is allowed to run. * * Return: Returns 0 if permission is granted. */ int security_uring_cmd(struct io_uring_cmd *ioucmd) { return call_int_hook(uring_cmd, ioucmd); } #endif /* CONFIG_IO_URING */ /** * security_initramfs_populated() - Notify LSMs that initramfs has been loaded * * Tells the LSMs the initramfs has been unpacked into the rootfs. */ void security_initramfs_populated(void) { call_void_hook(initramfs_populated); } |
| 176 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MBCACHE_H #define _LINUX_MBCACHE_H #include <linux/hash.h> #include <linux/list_bl.h> #include <linux/list.h> #include <linux/atomic.h> #include <linux/fs.h> struct mb_cache; /* Cache entry flags */ enum { MBE_REFERENCED_B = 0, MBE_REUSABLE_B }; struct mb_cache_entry { /* List of entries in cache - protected by cache->c_list_lock */ struct list_head e_list; /* * Hash table list - protected by hash chain bitlock. The entry is * guaranteed to be hashed while e_refcnt > 0. */ struct hlist_bl_node e_hash_list; /* * Entry refcount. Once it reaches zero, entry is unhashed and freed. * While refcount > 0, the entry is guaranteed to stay in the hash and * e.g. mb_cache_entry_try_delete() will fail. */ atomic_t e_refcnt; /* Key in hash - stable during lifetime of the entry */ u32 e_key; unsigned long e_flags; /* User provided value - stable during lifetime of the entry */ u64 e_value; }; struct mb_cache *mb_cache_create(int bucket_bits); void mb_cache_destroy(struct mb_cache *cache); int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable); void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry); void mb_cache_entry_wait_unused(struct mb_cache_entry *entry); static inline void mb_cache_entry_put(struct mb_cache *cache, struct mb_cache_entry *entry) { unsigned int cnt = atomic_dec_return(&entry->e_refcnt); if (cnt > 0) { if (cnt <= 2) wake_up_var(&entry->e_refcnt); return; } __mb_cache_entry_free(cache, entry); } struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key); struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry); #endif /* _LINUX_MBCACHE_H */ |
| 150 13 132 8 8 8 8 10 15 9 10 22 22 11 1 9 15 15 8 7 2 1 1 9 4 5 5 15 1 1 7 1 3 2 10 2 7 5 13 12 1 4 8 26 26 3 15 8 21 25 32 14 29 17 30 16 48 48 47 1 46 151 9 1 150 1 22 13 2 9 15 13 26 48 1 3 56 1 9 155 24 138 2 2 2 175 1 1 8 140 12 15 164 10 38 115 151 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 | // SPDX-License-Identifier: GPL-2.0 /* * Quota code necessary even when VFS quota support is not compiled * into the kernel. The interesting stuff is over in dquot.c, here * we have symbols for initial quotactl(2) handling, the sysctl(2) * variables, etc - things needed even when quota support disabled. */ #include <linux/fs.h> #include <linux/namei.h> #include <linux/slab.h> #include <asm/current.h> #include <linux/blkdev.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> #include <linux/mount.h> #include <linux/writeback.h> #include <linux/nospec.h> #include "compat.h" #include "../internal.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) { switch (cmd) { /* these commands do not require any special privilegues */ case Q_GETFMT: case Q_SYNC: case Q_GETINFO: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XQUOTASYNC: break; /* allow to query information for dquots we "own" */ case Q_GETQUOTA: case Q_XGETQUOTA: if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) || (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id)))) break; fallthrough; default: if (!capable(CAP_SYS_ADMIN)) return -EPERM; } return security_quotactl(cmd, type, id, sb); } static void quota_sync_one(struct super_block *sb, void *arg) { int type = *(int *)arg; if (sb->s_qcop && sb->s_qcop->quota_sync && (sb->s_quota_types & (1 << type))) sb->s_qcop->quota_sync(sb, type); } static int quota_sync_all(int type) { int ret; ret = security_quotactl(Q_SYNC, type, 0, NULL); if (!ret) iterate_supers(quota_sync_one, &type); return ret; } unsigned int qtype_enforce_flag(int type) { switch (type) { case USRQUOTA: return FS_QUOTA_UDQ_ENFD; case GRPQUOTA: return FS_QUOTA_GDQ_ENFD; case PRJQUOTA: return FS_QUOTA_PDQ_ENFD; } return 0; } static int quota_quotaon(struct super_block *sb, int type, qid_t id, const struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; if (sb->s_qcop->quota_enable) return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); if (IS_ERR(path)) return PTR_ERR(path); return sb->s_qcop->quota_on(sb, type, id, path); } static int quota_quotaoff(struct super_block *sb, int type) { if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable) return -ENOSYS; if (sb->s_qcop->quota_disable) return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type)); return sb->s_qcop->quota_off(sb, type); } static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; if (!sb_has_quota_active(sb, type)) return -ESRCH; fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; if (copy_to_user(addr, &fmt, sizeof(fmt))) return -EFAULT; return 0; } static int quota_getinfo(struct super_block *sb, int type, void __user *addr) { struct qc_state state; struct qc_type_state *tstate; struct if_dqinfo uinfo; int ret; if (!sb->s_qcop->get_state) return -ENOSYS; ret = sb->s_qcop->get_state(sb, &state); if (ret) return ret; tstate = state.s_state + type; if (!(tstate->flags & QCI_ACCT_ENABLED)) return -ESRCH; memset(&uinfo, 0, sizeof(uinfo)); uinfo.dqi_bgrace = tstate->spc_timelimit; uinfo.dqi_igrace = tstate->ino_timelimit; if (tstate->flags & QCI_SYSFILE) uinfo.dqi_flags |= DQF_SYS_FILE; if (tstate->flags & QCI_ROOT_SQUASH) uinfo.dqi_flags |= DQF_ROOT_SQUASH; uinfo.dqi_valid = IIF_ALL; if (copy_to_user(addr, &uinfo, sizeof(uinfo))) return -EFAULT; return 0; } static int quota_setinfo(struct super_block *sb, int type, void __user *addr) { struct if_dqinfo info; struct qc_info qinfo; if (copy_from_user(&info, addr, sizeof(info))) return -EFAULT; if (!sb->s_qcop->set_info) return -ENOSYS; if (info.dqi_valid & ~(IIF_FLAGS | IIF_BGRACE | IIF_IGRACE)) return -EINVAL; memset(&qinfo, 0, sizeof(qinfo)); if (info.dqi_valid & IIF_FLAGS) { if (info.dqi_flags & ~DQF_SETINFO_MASK) return -EINVAL; if (info.dqi_flags & DQF_ROOT_SQUASH) qinfo.i_flags |= QCI_ROOT_SQUASH; qinfo.i_fieldmask |= QC_FLAGS; } if (info.dqi_valid & IIF_BGRACE) { qinfo.i_spc_timelimit = info.dqi_bgrace; qinfo.i_fieldmask |= QC_SPC_TIMER; } if (info.dqi_valid & IIF_IGRACE) { qinfo.i_ino_timelimit = info.dqi_igrace; qinfo.i_fieldmask |= QC_INO_TIMER; } return sb->s_qcop->set_info(sb, type, &qinfo); } static inline qsize_t qbtos(qsize_t blocks) { return blocks << QIF_DQBLKSIZE_BITS; } static inline qsize_t stoqb(qsize_t space) { return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; } static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src) { memset(dst, 0, sizeof(*dst)); dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit); dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit); dst->dqb_curspace = src->d_space; dst->dqb_ihardlimit = src->d_ino_hardlimit; dst->dqb_isoftlimit = src->d_ino_softlimit; dst->dqb_curinodes = src->d_ino_count; dst->dqb_btime = src->d_spc_timer; dst->dqb_itime = src->d_ino_timer; dst->dqb_valid = QIF_ALL; } static int quota_getquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct kqid qid; struct qc_dqblk fdq; struct if_dqblk idq; int ret; if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) return ret; copy_to_if_dqblk(&idq, &fdq); if (compat_need_64bit_alignment_fixup()) { struct compat_if_dqblk __user *compat_dqblk = addr; if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk))) return -EFAULT; if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) return -EFAULT; } else { if (copy_to_user(addr, &idq, sizeof(idq))) return -EFAULT; } return 0; } /* * Return quota for next active quota >= this id, if any exists, * otherwise return -ENOENT via ->get_nextdqblk */ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct kqid qid; struct qc_dqblk fdq; struct if_nextdqblk idq; int ret; if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); if (ret) return ret; /* struct if_nextdqblk is a superset of struct if_dqblk */ copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq); idq.dqb_id = from_kqid(current_user_ns(), qid); if (copy_to_user(addr, &idq, sizeof(idq))) return -EFAULT; return 0; } static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) { dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit); dst->d_space = src->dqb_curspace; dst->d_ino_hardlimit = src->dqb_ihardlimit; dst->d_ino_softlimit = src->dqb_isoftlimit; dst->d_ino_count = src->dqb_curinodes; dst->d_spc_timer = src->dqb_btime; dst->d_ino_timer = src->dqb_itime; dst->d_fieldmask = 0; if (src->dqb_valid & QIF_BLIMITS) dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD; if (src->dqb_valid & QIF_SPACE) dst->d_fieldmask |= QC_SPACE; if (src->dqb_valid & QIF_ILIMITS) dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD; if (src->dqb_valid & QIF_INODES) dst->d_fieldmask |= QC_INO_COUNT; if (src->dqb_valid & QIF_BTIME) dst->d_fieldmask |= QC_SPC_TIMER; if (src->dqb_valid & QIF_ITIME) dst->d_fieldmask |= QC_INO_TIMER; } static int quota_setquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct qc_dqblk fdq; struct if_dqblk idq; struct kqid qid; if (compat_need_64bit_alignment_fixup()) { struct compat_if_dqblk __user *compat_dqblk = addr; if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) || get_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) return -EFAULT; } else { if (copy_from_user(&idq, addr, sizeof(idq))) return -EFAULT; } if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; copy_from_if_dqblk(&fdq, &idq); return sb->s_qcop->set_dqblk(sb, qid, &fdq); } static int quota_enable(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; if (!sb->s_qcop->quota_enable) return -ENOSYS; return sb->s_qcop->quota_enable(sb, flags); } static int quota_disable(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; if (!sb->s_qcop->quota_disable) return -ENOSYS; return sb->s_qcop->quota_disable(sb, flags); } static int quota_state_to_flags(struct qc_state *state) { int flags = 0; if (state->s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) flags |= FS_QUOTA_UDQ_ACCT; if (state->s_state[USRQUOTA].flags & QCI_LIMITS_ENFORCED) flags |= FS_QUOTA_UDQ_ENFD; if (state->s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) flags |= FS_QUOTA_GDQ_ACCT; if (state->s_state[GRPQUOTA].flags & QCI_LIMITS_ENFORCED) flags |= FS_QUOTA_GDQ_ENFD; if (state->s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) flags |= FS_QUOTA_PDQ_ACCT; if (state->s_state[PRJQUOTA].flags & QCI_LIMITS_ENFORCED) flags |= FS_QUOTA_PDQ_ENFD; return flags; } static int quota_getstate(struct super_block *sb, int type, struct fs_quota_stat *fqs) { struct qc_state state; int ret; memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; memset(fqs, 0, sizeof(*fqs)); fqs->qs_version = FS_QSTAT_VERSION; fqs->qs_flags = quota_state_to_flags(&state); /* No quota enabled? */ if (!fqs->qs_flags) return -ENOSYS; fqs->qs_incoredqs = state.s_incoredqs; fqs->qs_btimelimit = state.s_state[type].spc_timelimit; fqs->qs_itimelimit = state.s_state[type].ino_timelimit; fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; /* Inodes may be allocated even if inactive; copy out if present */ if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } if (state.s_state[PRJQUOTA].ino) { /* * Q_XGETQSTAT doesn't have room for both group and project * quotas. So, allow the project quota values to be copied out * only if there is no group quota information available. */ if (!(state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)) { fqs->qs_gquota.qfs_ino = state.s_state[PRJQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; } } return 0; } static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to, struct fs_qfilestat *from) { if (copy_to_user(to, from, sizeof(*to)) || put_user(from->qfs_nextents, &to->qfs_nextents)) return -EFAULT; return 0; } static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to, struct fs_quota_stat *from) { if (put_user(from->qs_version, &to->qs_version) || put_user(from->qs_flags, &to->qs_flags) || put_user(from->qs_pad, &to->qs_pad) || compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) || compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) || put_user(from->qs_incoredqs, &to->qs_incoredqs) || put_user(from->qs_btimelimit, &to->qs_btimelimit) || put_user(from->qs_itimelimit, &to->qs_itimelimit) || put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) || put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) || put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit)) return -EFAULT; return 0; } static int quota_getxstate(struct super_block *sb, int type, void __user *addr) { struct fs_quota_stat fqs; int ret; if (!sb->s_qcop->get_state) return -ENOSYS; ret = quota_getstate(sb, type, &fqs); if (ret) return ret; if (compat_need_64bit_alignment_fixup()) return compat_copy_fs_quota_stat(addr, &fqs); if (copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; return 0; } static int quota_getstatev(struct super_block *sb, int type, struct fs_quota_statv *fqs) { struct qc_state state; int ret; memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; memset(fqs, 0, sizeof(*fqs)); fqs->qs_version = FS_QSTAT_VERSION; fqs->qs_flags = quota_state_to_flags(&state); /* No quota enabled? */ if (!fqs->qs_flags) return -ENOSYS; fqs->qs_incoredqs = state.s_incoredqs; fqs->qs_btimelimit = state.s_state[type].spc_timelimit; fqs->qs_itimelimit = state.s_state[type].ino_timelimit; fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; fqs->qs_rtbwarnlimit = state.s_state[type].rt_spc_warnlimit; /* Inodes may be allocated even if inactive; copy out if present */ if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } if (state.s_state[PRJQUOTA].ino) { fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino; fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; } return 0; } static int quota_getxstatev(struct super_block *sb, int type, void __user *addr) { struct fs_quota_statv fqs; int ret; if (!sb->s_qcop->get_state) return -ENOSYS; memset(&fqs, 0, sizeof(fqs)); if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */ return -EFAULT; /* If this kernel doesn't support user specified version, fail */ switch (fqs.qs_version) { case FS_QSTATV_VERSION1: break; default: return -EINVAL; } ret = quota_getstatev(sb, type, &fqs); if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; return ret; } /* * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them * out of there as xfsprogs rely on definitions being in that header file. So * just define same functions here for quota purposes. */ #define XFS_BB_SHIFT 9 static inline u64 quota_bbtob(u64 blocks) { return blocks << XFS_BB_SHIFT; } static inline u64 quota_btobb(u64 bytes) { return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; } static inline s64 copy_from_xfs_dqblk_ts(const struct fs_disk_quota *d, __s32 timer, __s8 timer_hi) { if (d->d_fieldmask & FS_DQ_BIGTIME) return (u32)timer | (s64)timer_hi << 32; return timer; } static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) { dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit); dst->d_ino_hardlimit = src->d_ino_hardlimit; dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_space = quota_bbtob(src->d_bcount); dst->d_ino_count = src->d_icount; dst->d_ino_timer = copy_from_xfs_dqblk_ts(src, src->d_itimer, src->d_itimer_hi); dst->d_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_btimer, src->d_btimer_hi); dst->d_ino_warns = src->d_iwarns; dst->d_spc_warns = src->d_bwarns; dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); dst->d_rt_space = quota_bbtob(src->d_rtbcount); dst->d_rt_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_rtbtimer, src->d_rtbtimer_hi); dst->d_rt_spc_warns = src->d_rtbwarns; dst->d_fieldmask = 0; if (src->d_fieldmask & FS_DQ_ISOFT) dst->d_fieldmask |= QC_INO_SOFT; if (src->d_fieldmask & FS_DQ_IHARD) dst->d_fieldmask |= QC_INO_HARD; if (src->d_fieldmask & FS_DQ_BSOFT) dst->d_fieldmask |= QC_SPC_SOFT; if (src->d_fieldmask & FS_DQ_BHARD) dst->d_fieldmask |= QC_SPC_HARD; if (src->d_fieldmask & FS_DQ_RTBSOFT) dst->d_fieldmask |= QC_RT_SPC_SOFT; if (src->d_fieldmask & FS_DQ_RTBHARD) dst->d_fieldmask |= QC_RT_SPC_HARD; if (src->d_fieldmask & FS_DQ_BTIMER) dst->d_fieldmask |= QC_SPC_TIMER; if (src->d_fieldmask & FS_DQ_ITIMER) dst->d_fieldmask |= QC_INO_TIMER; if (src->d_fieldmask & FS_DQ_RTBTIMER) dst->d_fieldmask |= QC_RT_SPC_TIMER; if (src->d_fieldmask & FS_DQ_BWARNS) dst->d_fieldmask |= QC_SPC_WARNS; if (src->d_fieldmask & FS_DQ_IWARNS) dst->d_fieldmask |= QC_INO_WARNS; if (src->d_fieldmask & FS_DQ_RTBWARNS) dst->d_fieldmask |= QC_RT_SPC_WARNS; if (src->d_fieldmask & FS_DQ_BCOUNT) dst->d_fieldmask |= QC_SPACE; if (src->d_fieldmask & FS_DQ_ICOUNT) dst->d_fieldmask |= QC_INO_COUNT; if (src->d_fieldmask & FS_DQ_RTBCOUNT) dst->d_fieldmask |= QC_RT_SPACE; } static void copy_qcinfo_from_xfs_dqblk(struct qc_info *dst, struct fs_disk_quota *src) { memset(dst, 0, sizeof(*dst)); dst->i_spc_timelimit = src->d_btimer; dst->i_ino_timelimit = src->d_itimer; dst->i_rt_spc_timelimit = src->d_rtbtimer; dst->i_ino_warnlimit = src->d_iwarns; dst->i_spc_warnlimit = src->d_bwarns; dst->i_rt_spc_warnlimit = src->d_rtbwarns; if (src->d_fieldmask & FS_DQ_BWARNS) dst->i_fieldmask |= QC_SPC_WARNS; if (src->d_fieldmask & FS_DQ_IWARNS) dst->i_fieldmask |= QC_INO_WARNS; if (src->d_fieldmask & FS_DQ_RTBWARNS) dst->i_fieldmask |= QC_RT_SPC_WARNS; if (src->d_fieldmask & FS_DQ_BTIMER) dst->i_fieldmask |= QC_SPC_TIMER; if (src->d_fieldmask & FS_DQ_ITIMER) dst->i_fieldmask |= QC_INO_TIMER; if (src->d_fieldmask & FS_DQ_RTBTIMER) dst->i_fieldmask |= QC_RT_SPC_TIMER; } static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; struct qc_dqblk qdq; struct kqid qid; if (copy_from_user(&fdq, addr, sizeof(fdq))) return -EFAULT; if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ if (from_kqid(sb->s_user_ns, qid) == 0 && fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) { struct qc_info qinfo; int ret; if (!sb->s_qcop->set_info) return -EINVAL; copy_qcinfo_from_xfs_dqblk(&qinfo, &fdq); ret = sb->s_qcop->set_info(sb, type, &qinfo); if (ret) return ret; /* These are already done */ fdq.d_fieldmask &= ~(FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK); } copy_from_xfs_dqblk(&qdq, &fdq); return sb->s_qcop->set_dqblk(sb, qid, &qdq); } static inline void copy_to_xfs_dqblk_ts(const struct fs_disk_quota *d, __s32 *timer_lo, __s8 *timer_hi, s64 timer) { *timer_lo = timer; if (d->d_fieldmask & FS_DQ_BIGTIME) *timer_hi = timer >> 32; } static inline bool want_bigtime(s64 timer) { return timer > S32_MAX || timer < S32_MIN; } static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, int type, qid_t id) { memset(dst, 0, sizeof(*dst)); if (want_bigtime(src->d_ino_timer) || want_bigtime(src->d_spc_timer) || want_bigtime(src->d_rt_spc_timer)) dst->d_fieldmask |= FS_DQ_BIGTIME; dst->d_version = FS_DQUOT_VERSION; dst->d_id = id; if (type == USRQUOTA) dst->d_flags = FS_USER_QUOTA; else if (type == PRJQUOTA) dst->d_flags = FS_PROJ_QUOTA; else dst->d_flags = FS_GROUP_QUOTA; dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit); dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit); dst->d_ino_hardlimit = src->d_ino_hardlimit; dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_bcount = quota_btobb(src->d_space); dst->d_icount = src->d_ino_count; copy_to_xfs_dqblk_ts(dst, &dst->d_itimer, &dst->d_itimer_hi, src->d_ino_timer); copy_to_xfs_dqblk_ts(dst, &dst->d_btimer, &dst->d_btimer_hi, src->d_spc_timer); dst->d_iwarns = src->d_ino_warns; dst->d_bwarns = src->d_spc_warns; dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); dst->d_rtbcount = quota_btobb(src->d_rt_space); copy_to_xfs_dqblk_ts(dst, &dst->d_rtbtimer, &dst->d_rtbtimer_hi, src->d_rt_spc_timer); dst->d_rtbwarns = src->d_rt_spc_warns; } static int quota_getxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; struct qc_dqblk qdq; struct kqid qid; int ret; if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); if (ret) return ret; copy_to_xfs_dqblk(&fdq, &qdq, type, id); if (copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; } /* * Return quota for next active quota >= this id, if any exists, * otherwise return -ENOENT via ->get_nextdqblk. */ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; struct qc_dqblk qdq; struct kqid qid; qid_t id_out; int ret; if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); if (ret) return ret; id_out = from_kqid(current_user_ns(), qid); copy_to_xfs_dqblk(&fdq, &qdq, type, id_out); if (copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; } static int quota_rmxquota(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; if (!sb->s_qcop->rm_xquota) return -ENOSYS; return sb->s_qcop->rm_xquota(sb, flags); } /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr, const struct path *path) { int ret; type = array_index_nospec(type, MAXQUOTAS); /* * Quota not supported on this fs? Check this before s_quota_types * since they needn't be set if quota is not supported at all. */ if (!sb->s_qcop) return -ENOSYS; if (!(sb->s_quota_types & (1 << type))) return -EINVAL; ret = check_quotactl_permission(sb, type, cmd, id); if (ret < 0) return ret; switch (cmd) { case Q_QUOTAON: return quota_quotaon(sb, type, id, path); case Q_QUOTAOFF: return quota_quotaoff(sb, type); case Q_GETFMT: return quota_getfmt(sb, type, addr); case Q_GETINFO: return quota_getinfo(sb, type, addr); case Q_SETINFO: return quota_setinfo(sb, type, addr); case Q_GETQUOTA: return quota_getquota(sb, type, id, addr); case Q_GETNEXTQUOTA: return quota_getnextquota(sb, type, id, addr); case Q_SETQUOTA: return quota_setquota(sb, type, id, addr); case Q_SYNC: if (!sb->s_qcop->quota_sync) return -ENOSYS; return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: return quota_enable(sb, addr); case Q_XQUOTAOFF: return quota_disable(sb, addr); case Q_XQUOTARM: return quota_rmxquota(sb, addr); case Q_XGETQSTAT: return quota_getxstate(sb, type, addr); case Q_XGETQSTATV: return quota_getxstatev(sb, type, addr); case Q_XSETQLIM: return quota_setxquota(sb, type, id, addr); case Q_XGETQUOTA: return quota_getxquota(sb, type, id, addr); case Q_XGETNEXTQUOTA: return quota_getnextxquota(sb, type, id, addr); case Q_XQUOTASYNC: if (sb_rdonly(sb)) return -EROFS; /* XFS quotas are fully coherent now, making this call a noop */ return 0; default: return -EINVAL; } } /* Return 1 if 'cmd' will block on frozen filesystem */ static int quotactl_cmd_write(int cmd) { /* * We cannot allow Q_GETQUOTA and Q_GETNEXTQUOTA without write access * as dquot_acquire() may allocate space for new structure and OCFS2 * needs to increment on-disk use count. */ switch (cmd) { case Q_GETFMT: case Q_GETINFO: case Q_SYNC: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETQUOTA: case Q_XGETNEXTQUOTA: case Q_XQUOTASYNC: return 0; } return 1; } /* Return true if quotactl command is manipulating quota on/off state */ static bool quotactl_cmd_onoff(int cmd) { return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) || (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF); } /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon */ static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK struct super_block *sb; struct filename *tmp = getname(special); bool excl = false, thawed = false; int error; dev_t dev; if (IS_ERR(tmp)) return ERR_CAST(tmp); error = lookup_bdev(tmp->name, &dev); putname(tmp); if (error) return ERR_PTR(error); if (quotactl_cmd_onoff(cmd)) { excl = true; thawed = true; } else if (quotactl_cmd_write(cmd)) { thawed = true; } retry: sb = user_get_super(dev, excl); if (!sb) return ERR_PTR(-ENODEV); if (thawed && sb->s_writers.frozen != SB_UNFROZEN) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); /* Wait for sb to unfreeze */ sb_start_write(sb); sb_end_write(sb); put_super(sb); goto retry; } return sb; #else return ERR_PTR(-ENODEV); #endif } /* * This is the system call interface. This communicates with * the user-level programs. Currently this only supports diskquota * calls. Maybe we need to add the process quotas etc. in the future, * but we probably should use rlimits for that. */ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, qid_t, id, void __user *, addr) { uint cmds, type; struct super_block *sb = NULL; struct path path, *pathp = NULL; int ret; cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; if (type >= MAXQUOTAS) return -EINVAL; /* * As a special case Q_SYNC can be called without a specific device. * It will iterate all superblocks that have quota enabled and call * the sync action on each of them. */ if (!special) { if (cmds == Q_SYNC) return quota_sync_all(type); return -ENODEV; } /* * Path for quotaon has to be resolved before grabbing superblock * because that gets s_umount sem which is also possibly needed by path * resolution (think about autofs) and thus deadlocks could arise. */ if (cmds == Q_QUOTAON) { ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (ret) pathp = ERR_PTR(ret); else pathp = &path; } sb = quotactl_block(special, cmds); if (IS_ERR(sb)) { ret = PTR_ERR(sb); goto out; } ret = do_quotactl(sb, type, cmds, id, addr, pathp); if (!quotactl_cmd_onoff(cmds)) drop_super(sb); else drop_super_exclusive(sb); out: if (pathp && !IS_ERR(pathp)) path_put(pathp); return ret; } SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, qid_t, id, void __user *, addr) { struct super_block *sb; unsigned int cmds = cmd >> SUBCMDSHIFT; unsigned int type = cmd & SUBCMDMASK; struct fd f; int ret; f = fdget_raw(fd); if (!fd_file(f)) return -EBADF; ret = -EINVAL; if (type >= MAXQUOTAS) goto out; if (quotactl_cmd_write(cmds)) { ret = mnt_want_write(fd_file(f)->f_path.mnt); if (ret) goto out; } sb = fd_file(f)->f_path.mnt->mnt_sb; if (quotactl_cmd_onoff(cmds)) down_write(&sb->s_umount); else down_read(&sb->s_umount); ret = do_quotactl(sb, type, cmds, id, addr, ERR_PTR(-EINVAL)); if (quotactl_cmd_onoff(cmds)) up_write(&sb->s_umount); else up_read(&sb->s_umount); if (quotactl_cmd_write(cmds)) mnt_drop_write(fd_file(f)->f_path.mnt); out: fdput(f); return ret; } |
| 3 3 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "queueing.h" #include "socket.h" #include "timers.h" #include "device.h" #include "ratelimiter.h" #include "peer.h" #include "messages.h" #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/icmp.h> #include <linux/suspend.h> #include <net/dst_metadata.h> #include <net/gso.h> #include <net/icmp.h> #include <net/rtnetlink.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> static LIST_HEAD(device_list); static int wg_open(struct net_device *dev) { struct in_device *dev_v4 = __in_dev_get_rtnl(dev); struct inet6_dev *dev_v6 = __in6_dev_get(dev); struct wg_device *wg = netdev_priv(dev); struct wg_peer *peer; int ret; if (dev_v4) { /* At some point we might put this check near the ip_rt_send_ * redirect call of ip_forward in net/ipv4/ip_forward.c, similar * to the current secpath check. */ IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false); IPV4_DEVCONF_ALL(dev_net(dev), SEND_REDIRECTS) = false; } if (dev_v6) dev_v6->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_NONE; mutex_lock(&wg->device_update_lock); ret = wg_socket_init(wg, wg->incoming_port); if (ret < 0) goto out; list_for_each_entry(peer, &wg->peer_list, peer_list) { wg_packet_send_staged_packets(peer); if (peer->persistent_keepalive_interval) wg_packet_send_keepalive(peer); } out: mutex_unlock(&wg->device_update_lock); return ret; } static int wg_pm_notification(struct notifier_block *nb, unsigned long action, void *data) { struct wg_device *wg; struct wg_peer *peer; /* If the machine is constantly suspending and resuming, as part of * its normal operation rather than as a somewhat rare event, then we * don't actually want to clear keys. */ if (IS_ENABLED(CONFIG_PM_AUTOSLEEP) || IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP)) return 0; if (action != PM_HIBERNATION_PREPARE && action != PM_SUSPEND_PREPARE) return 0; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { mutex_lock(&wg->device_update_lock); list_for_each_entry(peer, &wg->peer_list, peer_list) { del_timer(&peer->timer_zero_key_material); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); } mutex_unlock(&wg->device_update_lock); } rtnl_unlock(); rcu_barrier(); return 0; } static struct notifier_block pm_notifier = { .notifier_call = wg_pm_notification }; static int wg_vm_notification(struct notifier_block *nb, unsigned long action, void *data) { struct wg_device *wg; struct wg_peer *peer; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { mutex_lock(&wg->device_update_lock); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_noise_expire_current_peer_keypairs(peer); mutex_unlock(&wg->device_update_lock); } rtnl_unlock(); return 0; } static struct notifier_block vm_notifier = { .notifier_call = wg_vm_notification }; static int wg_stop(struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); struct wg_peer *peer; struct sk_buff *skb; mutex_lock(&wg->device_update_lock); list_for_each_entry(peer, &wg->peer_list, peer_list) { wg_packet_purge_staged_packets(peer); wg_timers_stop(peer); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); } mutex_unlock(&wg->device_update_lock); while ((skb = ptr_ring_consume(&wg->handshake_queue.ring)) != NULL) kfree_skb(skb); atomic_set(&wg->handshake_queue_len, 0); wg_socket_reinit(wg, NULL, NULL); return 0; } static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); struct sk_buff_head packets; struct wg_peer *peer; struct sk_buff *next; sa_family_t family; u32 mtu; int ret; if (unlikely(!wg_check_packet_protocol(skb))) { ret = -EPROTONOSUPPORT; net_dbg_ratelimited("%s: Invalid IP packet\n", dev->name); goto err; } peer = wg_allowedips_lookup_dst(&wg->peer_allowedips, skb); if (unlikely(!peer)) { ret = -ENOKEY; if (skb->protocol == htons(ETH_P_IP)) net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI4\n", dev->name, &ip_hdr(skb)->daddr); else if (skb->protocol == htons(ETH_P_IPV6)) net_dbg_ratelimited("%s: No peer has allowed IPs matching %pI6\n", dev->name, &ipv6_hdr(skb)->daddr); goto err_icmp; } family = READ_ONCE(peer->endpoint.addr.sa_family); if (unlikely(family != AF_INET && family != AF_INET6)) { ret = -EDESTADDRREQ; net_dbg_ratelimited("%s: No valid endpoint has been configured or discovered for peer %llu\n", dev->name, peer->internal_id); goto err_peer; } mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; __skb_queue_head_init(&packets); if (!skb_is_gso(skb)) { skb_mark_not_on_list(skb); } else { struct sk_buff *segs = skb_gso_segment(skb, 0); if (IS_ERR(segs)) { ret = PTR_ERR(segs); goto err_peer; } dev_kfree_skb(skb); skb = segs; } skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) continue; /* We only need to keep the original dst around for icmp, * so at this point we're in a position to drop it. */ skb_dst_drop(skb); PACKET_CB(skb)->mtu = mtu; __skb_queue_tail(&packets, skb); } spin_lock_bh(&peer->staged_packet_queue.lock); /* If the queue is getting too big, we start removing the oldest packets * until it's small again. We do this before adding the new packet, so * we don't remove GSO segments that are in excess. */ while (skb_queue_len(&peer->staged_packet_queue) > MAX_STAGED_PACKETS) { dev_kfree_skb(__skb_dequeue(&peer->staged_packet_queue)); DEV_STATS_INC(dev, tx_dropped); } skb_queue_splice_tail(&packets, &peer->staged_packet_queue); spin_unlock_bh(&peer->staged_packet_queue.lock); wg_packet_send_staged_packets(peer); wg_peer_put(peer); return NETDEV_TX_OK; err_peer: wg_peer_put(peer); err_icmp: if (skb->protocol == htons(ETH_P_IP)) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); else if (skb->protocol == htons(ETH_P_IPV6)) icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return ret; } static const struct net_device_ops netdev_ops = { .ndo_open = wg_open, .ndo_stop = wg_stop, .ndo_start_xmit = wg_xmit, }; static void wg_destruct(struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); rtnl_lock(); list_del(&wg->device_list); rtnl_unlock(); mutex_lock(&wg->device_update_lock); rcu_assign_pointer(wg->creating_net, NULL); wg->incoming_port = 0; wg_socket_reinit(wg, NULL, NULL); /* The final references are cleared in the below calls to destroy_workqueue. */ wg_peer_remove_all(wg); destroy_workqueue(wg->handshake_receive_wq); destroy_workqueue(wg->handshake_send_wq); destroy_workqueue(wg->packet_crypt_wq); wg_packet_queue_free(&wg->handshake_queue, true); wg_packet_queue_free(&wg->decrypt_queue, false); wg_packet_queue_free(&wg->encrypt_queue, false); rcu_barrier(); /* Wait for all the peers to be actually freed. */ wg_ratelimiter_uninit(); memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); kvfree(wg->index_hashtable); kvfree(wg->peer_hashtable); mutex_unlock(&wg->device_update_lock); pr_debug("%s: Interface destroyed\n", dev->name); free_netdev(dev); } static const struct device_type device_type = { .name = KBUILD_MODNAME }; static void wg_setup(struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA }; const int overhead = MESSAGE_MINIMUM_LENGTH + sizeof(struct udphdr) + max(sizeof(struct ipv6hdr), sizeof(struct iphdr)); dev->netdev_ops = &netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->hard_header_len = 0; dev->addr_len = 0; dev->needed_headroom = DATA_PACKET_HEAD_ROOM; dev->needed_tailroom = noise_encrypted_len(MESSAGE_PADDING_MULTIPLE); dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->features |= WG_NETDEV_FEATURES; dev->hw_features |= WG_NETDEV_FEATURES; dev->hw_enc_features |= WG_NETDEV_FEATURES; dev->mtu = ETH_DATA_LEN - overhead; dev->max_mtu = round_down(INT_MAX, MESSAGE_PADDING_MULTIPLE) - overhead; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; SET_NETDEV_DEVTYPE(dev, &device_type); /* We need to keep the dst around in case of icmp replies. */ netif_keep_dst(dev); memset(wg, 0, sizeof(*wg)); wg->dev = dev; } static int wg_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct wg_device *wg = netdev_priv(dev); int ret = -ENOMEM; rcu_assign_pointer(wg->creating_net, src_net); init_rwsem(&wg->static_identity.lock); mutex_init(&wg->socket_update_lock); mutex_init(&wg->device_update_lock); wg_allowedips_init(&wg->peer_allowedips); wg_cookie_checker_init(&wg->cookie_checker, wg); INIT_LIST_HEAD(&wg->peer_list); wg->device_update_gen = 1; wg->peer_hashtable = wg_pubkey_hashtable_alloc(); if (!wg->peer_hashtable) return ret; wg->index_hashtable = wg_index_hashtable_alloc(); if (!wg->index_hashtable) goto err_free_peer_hashtable; wg->handshake_receive_wq = alloc_workqueue("wg-kex-%s", WQ_CPU_INTENSIVE | WQ_FREEZABLE, 0, dev->name); if (!wg->handshake_receive_wq) goto err_free_index_hashtable; wg->handshake_send_wq = alloc_workqueue("wg-kex-%s", WQ_UNBOUND | WQ_FREEZABLE, 0, dev->name); if (!wg->handshake_send_wq) goto err_destroy_handshake_receive; wg->packet_crypt_wq = alloc_workqueue("wg-crypt-%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 0, dev->name); if (!wg->packet_crypt_wq) goto err_destroy_handshake_send; ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker, MAX_QUEUED_PACKETS); if (ret < 0) goto err_destroy_packet_crypt; ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker, MAX_QUEUED_PACKETS); if (ret < 0) goto err_free_encrypt_queue; ret = wg_packet_queue_init(&wg->handshake_queue, wg_packet_handshake_receive_worker, MAX_QUEUED_INCOMING_HANDSHAKES); if (ret < 0) goto err_free_decrypt_queue; ret = wg_ratelimiter_init(); if (ret < 0) goto err_free_handshake_queue; ret = register_netdevice(dev); if (ret < 0) goto err_uninit_ratelimiter; list_add(&wg->device_list, &device_list); /* We wait until the end to assign priv_destructor, so that * register_netdevice doesn't call it for us if it fails. */ dev->priv_destructor = wg_destruct; pr_debug("%s: Interface created\n", dev->name); return ret; err_uninit_ratelimiter: wg_ratelimiter_uninit(); err_free_handshake_queue: wg_packet_queue_free(&wg->handshake_queue, false); err_free_decrypt_queue: wg_packet_queue_free(&wg->decrypt_queue, false); err_free_encrypt_queue: wg_packet_queue_free(&wg->encrypt_queue, false); err_destroy_packet_crypt: destroy_workqueue(wg->packet_crypt_wq); err_destroy_handshake_send: destroy_workqueue(wg->handshake_send_wq); err_destroy_handshake_receive: destroy_workqueue(wg->handshake_receive_wq); err_free_index_hashtable: kvfree(wg->index_hashtable); err_free_peer_hashtable: kvfree(wg->peer_hashtable); return ret; } static struct rtnl_link_ops link_ops __read_mostly = { .kind = KBUILD_MODNAME, .priv_size = sizeof(struct wg_device), .setup = wg_setup, .newlink = wg_newlink, }; static void wg_netns_pre_exit(struct net *net) { struct wg_device *wg; struct wg_peer *peer; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { if (rcu_access_pointer(wg->creating_net) == net) { pr_debug("%s: Creating namespace exiting\n", wg->dev->name); netif_carrier_off(wg->dev); mutex_lock(&wg->device_update_lock); rcu_assign_pointer(wg->creating_net, NULL); wg_socket_reinit(wg, NULL, NULL); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); mutex_unlock(&wg->device_update_lock); } } rtnl_unlock(); } static struct pernet_operations pernet_ops = { .pre_exit = wg_netns_pre_exit }; int __init wg_device_init(void) { int ret; ret = register_pm_notifier(&pm_notifier); if (ret) return ret; ret = register_random_vmfork_notifier(&vm_notifier); if (ret) goto error_pm; ret = register_pernet_device(&pernet_ops); if (ret) goto error_vm; ret = rtnl_link_register(&link_ops); if (ret) goto error_pernet; return 0; error_pernet: unregister_pernet_device(&pernet_ops); error_vm: unregister_random_vmfork_notifier(&vm_notifier); error_pm: unregister_pm_notifier(&pm_notifier); return ret; } void wg_device_uninit(void) { rtnl_link_unregister(&link_ops); unregister_pernet_device(&pernet_ops); unregister_random_vmfork_notifier(&vm_notifier); unregister_pm_notifier(&pm_notifier); rcu_barrier(); } |
| 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> struct macvtap_dev { struct macvlan_dev vlan; struct tap_dev tap; }; /* * Variables for dealing with macvtaps device numbers. */ static dev_t macvtap_major; static const void *macvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class macvtap_class = { .name = "macvtap", .ns_type = &net_ns_type_operations, .namespace = macvtap_net_namespace, }; static struct cdev macvtap_cdev; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static void macvtap_count_tx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_dropped); } static void macvtap_count_rx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; macvlan_count_rx(vlan, 0, 0, 0); } static void macvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; vlan->set_features = features; netdev_update_features(vlan->dev); } static int macvtap_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; /* Register callbacks for rx/tx drops accounting and updating * net_device features */ vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped; vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped; vlantap->tap.update_features = macvtap_update_features; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = macvlan_common_newlink(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return 0; } static void macvtap_dellink(struct net_device *dev, struct list_head *head) { struct macvtap_dev *vlantap = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlantap->tap); macvlan_dellink(dev, head); } static void macvtap_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; } static struct net *macvtap_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, .get_link_net = macvtap_link_net, .priv_size = sizeof(struct macvtap_dev), }; static int macvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &macvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(macvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); classdev = device_create(&macvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(macvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); device_destroy(&macvtap_class, devt); tap_free_minor(macvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block macvtap_notifier_block __read_mostly = { .notifier_call = macvtap_device_event, }; static int __init macvtap_init(void) { int err; err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap", THIS_MODULE); if (err) goto out1; err = class_register(&macvtap_class); if (err) goto out2; err = register_netdevice_notifier(&macvtap_notifier_block); if (err) goto out3; err = macvlan_link_register(&macvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&macvtap_notifier_block); out3: class_unregister(&macvtap_class); out2: tap_destroy_cdev(macvtap_major, &macvtap_cdev); out1: return err; } module_init(macvtap_init); static void __exit macvtap_exit(void) { rtnl_link_unregister(&macvtap_link_ops); unregister_netdevice_notifier(&macvtap_notifier_block); class_unregister(&macvtap_class); tap_destroy_cdev(macvtap_major, &macvtap_cdev); } module_exit(macvtap_exit); MODULE_ALIAS_RTNL_LINK("macvtap"); MODULE_DESCRIPTION("MAC-VLAN based tap driver"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_LICENSE("GPL"); |
| 3 23 56 115 22 1 8 2 22 22 10 16 12 11 2 22 22 8 6 3 4 4 2 2 2 2 2 1 1 1 6 2 2 3 3 5 115 115 54 54 59 2 13 11 11 37 28 23 23 34 34 3 34 34 37 27 37 17 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/msdos.c * * Code extracted from drivers/block/genhd.c * Copyright (C) 1991-1998 Linus Torvalds * * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug * in the early extended-partition checks and added DM partitions * * Support for DiskManager v6.0x added by Mark Lord, * with information provided by OnTrack. This now works for linux fdisk * and LILO, as well as loadlin and bootln. Note that disks other than * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). * * More flexible handling of extended partitions - aeb, 950831 * * Check partition table on IDE disks for common CHS translations * * Re-organised Feb 1998 Russell King * * BSD disklabel support by Yossi Gottlieb <yogo@math.tau.ac.il> * updated by Marc Espie <Marc.Espie@openbsd.org> * * Unixware slices support by Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl> * and Krzysztof G. Baranowski <kgb@knm.org.pl> */ #include <linux/msdos_fs.h> #include <linux/msdos_partition.h> #include "check.h" #include "efi.h" /* * Many architectures don't like unaligned accesses, while * the nr_sects and start_sect partition table entries are * at a 2 (mod 4) address. */ #include <linux/unaligned.h> static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); } static inline sector_t start_sect(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->start_sect); } static inline int is_extended_partition(struct msdos_partition *p) { return (p->sys_ind == DOS_EXTENDED_PARTITION || p->sys_ind == WIN98_EXTENDED_PARTITION || p->sys_ind == LINUX_EXTENDED_PARTITION); } #define MSDOS_LABEL_MAGIC1 0x55 #define MSDOS_LABEL_MAGIC2 0xAA static inline int msdos_magic_present(unsigned char *p) { return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); } /* Value is EBCDIC 'IBMA' */ #define AIX_LABEL_MAGIC1 0xC9 #define AIX_LABEL_MAGIC2 0xC2 #define AIX_LABEL_MAGIC3 0xD4 #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); Sector sect; unsigned char *d; int slot, ret = 0; if (!(p[0] == AIX_LABEL_MAGIC1 && p[1] == AIX_LABEL_MAGIC2 && p[2] == AIX_LABEL_MAGIC3 && p[3] == AIX_LABEL_MAGIC4)) return 0; /* * Assume the partition table is valid if Linux partitions exists. * Note that old Solaris/x86 partitions use the same indicator as * Linux swap partitions, so we consider that a Linux partition as * well. */ for (slot = 1; slot <= 4; slot++, pt++) { if (pt->sys_ind == SOLARIS_X86_PARTITION || pt->sys_ind == LINUX_RAID_PARTITION || pt->sys_ind == LINUX_DATA_PARTITION || pt->sys_ind == LINUX_LVM_PARTITION || is_extended_partition(pt)) return 0; } d = read_part_sector(state, 7, §); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); } return ret; } static void set_info(struct parsed_partitions *state, int slot, u32 disksig) { struct partition_meta_info *info = &state->parts[slot].info; snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, slot); info->volname[0] = 0; state->parts[slot].has_info = true; } /* * Create devices for each logical partition in an extended partition. * The logical partitions form a linked list, with each entry being * a partition table with two entries. The first entry * is the real data partition (with a start relative to the partition * table start). The second is a pointer to the next logical partition * (with a start relative to the entire extended partition). * We do not create a Linux partition for the partition tables, but * only for the actual data partitions. */ static void parse_extended(struct parsed_partitions *state, sector_t first_sector, sector_t first_size, u32 disksig) { struct msdos_partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; while (1) { if (++loopct > 100) return; if (state->next == state->limit) return; data = read_part_sector(state, this_sector, §); if (!data) return; if (!msdos_magic_present(data + 510)) goto done; p = (struct msdos_partition *) (data + 0x1be); /* * Usually, the first entry is the real data partition, * the 2nd entry is the next extended partition, or empty, * and the 3rd and 4th entries are unused. * However, DRDOS sometimes has the extended partition as * the first entry (when the data partition is empty), * and OS/2 seems to use all four entries. */ /* * First process the data partition(s) */ for (i = 0; i < 4; i++, p++) { sector_t offs, size, next; if (!nr_sects(p) || is_extended_partition(p)) continue; /* Check the 3rd and 4th entries - these sometimes contain random garbage */ offs = start_sect(p)*sector_size; size = nr_sects(p)*sector_size; next = this_sector + offs; if (i >= 2) { if (offs + size > this_size) continue; if (next < first_sector) continue; if (next + size > first_sector + first_size) continue; } put_partition(state, state->next, next, size); set_info(state, state->next, disksig); if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[state->next].flags = ADDPART_FLAG_RAID; loopct = 0; if (++state->next == state->limit) goto done; } /* * Next, process the (first) extended partition, if present. * (So far, there seems to be no reason to make * parse_extended() recursive and allow a tree * of extended partitions.) * It should be a link to the next logical partition. */ p -= 4; for (i = 0; i < 4; i++, p++) if (nr_sects(p) && is_extended_partition(p)) break; if (i == 4) goto done; /* nothing left to do */ this_sector = first_sector + start_sect(p) * sector_size; this_size = nr_sects(p) * sector_size; put_dev_sector(sect); } done: put_dev_sector(sect); } #define SOLARIS_X86_NUMSLICE 16 #define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) struct solaris_x86_slice { __le16 s_tag; /* ID tag of partition */ __le16 s_flag; /* permission flags */ __le32 s_start; /* start sector no of partition */ __le32 s_size; /* # of blocks in partition */ }; struct solaris_x86_vtoc { unsigned int v_bootinfo[3]; /* info needed by mboot */ __le32 v_sanity; /* to verify vtoc sanity */ __le32 v_version; /* layout version */ char v_volume[8]; /* volume name */ __le16 v_sectorsz; /* sector size in bytes */ __le16 v_nparts; /* number of partitions */ unsigned int v_reserved[10]; /* free space */ struct solaris_x86_slice v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ char v_asciilabel[128]; /* for compatibility */ }; /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ static void parse_solaris_x86(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_SOLARIS_X86_PARTITION Sector sect; struct solaris_x86_vtoc *v; int i; short max_nparts; v = read_part_sector(state, offset + 1, §); if (!v) return; if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { put_dev_sector(sect); return; } { char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); } if (le32_to_cpu(v->v_version) != 1) { char tmp[64]; snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", le32_to_cpu(v->v_version)); strlcat(state->pp_buf, tmp, PAGE_SIZE); put_dev_sector(sect); return; } /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; for (i = 0; i < max_nparts && state->next < state->limit; i++) { struct solaris_x86_slice *s = &v->v_slice[i]; char tmp[3 + 10 + 1 + 1]; if (s->s_size == 0) continue; snprintf(tmp, sizeof(tmp), " [s%d]", i); strlcat(state->pp_buf, tmp, PAGE_SIZE); /* solaris partitions are relative to current MS-DOS * one; must add the offset of the current partition */ put_partition(state, state->next++, le32_to_cpu(s->s_start)+offset, le32_to_cpu(s->s_size)); } put_dev_sector(sect); strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } /* check against BSD src/sys/sys/disklabel.h for consistency */ #define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ #define BSD_MAXPARTITIONS 16 #define OPENBSD_MAXPARTITIONS 16 #define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ struct bsd_disklabel { __le32 d_magic; /* the magic number */ __s16 d_type; /* drive type */ __s16 d_subtype; /* controller/d_type specific */ char d_typename[16]; /* type name, e.g. "eagle" */ char d_packname[16]; /* pack identifier */ __u32 d_secsize; /* # of bytes per sector */ __u32 d_nsectors; /* # of data sectors per track */ __u32 d_ntracks; /* # of tracks per cylinder */ __u32 d_ncylinders; /* # of data cylinders per unit */ __u32 d_secpercyl; /* # of data sectors per cylinder */ __u32 d_secperunit; /* # of data sectors per unit */ __u16 d_sparespertrack; /* # of spare sectors per track */ __u16 d_sparespercyl; /* # of spare sectors per cylinder */ __u32 d_acylinders; /* # of alt. cylinders per unit */ __u16 d_rpm; /* rotational speed */ __u16 d_interleave; /* hardware sector interleave */ __u16 d_trackskew; /* sector 0 skew, per track */ __u16 d_cylskew; /* sector 0 skew, per cylinder */ __u32 d_headswitch; /* head switch time, usec */ __u32 d_trkseek; /* track-to-track seek, usec */ __u32 d_flags; /* generic flags */ #define NDDATA 5 __u32 d_drivedata[NDDATA]; /* drive-type specific information */ #define NSPARE 5 __u32 d_spare[NSPARE]; /* reserved for future use */ __le32 d_magic2; /* the magic number (again) */ __le16 d_checksum; /* xor of data incl. partitions */ /* filesystem and partition information: */ __le16 d_npartitions; /* number of partitions in following */ __le32 d_bbsize; /* size of boot area at sn0, bytes */ __le32 d_sbsize; /* max size of fs superblock, bytes */ struct bsd_partition { /* the partition table */ __le32 p_size; /* number of sectors in partition */ __le32 p_offset; /* starting sector */ __le32 p_fsize; /* filesystem basic fragment size */ __u8 p_fstype; /* filesystem type, see below */ __u8 p_frag; /* filesystem fragments per block */ __le16 p_cpg; /* filesystem cylinders per group */ } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ }; #if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ static void parse_bsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin, char *flavour, int max_partitions) { Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; char tmp[64]; l = read_part_sector(state, offset + 1, §); if (!l) return; if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { put_dev_sector(sect); return; } snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); strlcat(state->pp_buf, tmp, PAGE_SIZE); if (le16_to_cpu(l->d_npartitions) < max_partitions) max_partitions = le16_to_cpu(l->d_npartitions); for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { sector_t bsd_start, bsd_size; if (state->next == state->limit) break; if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); /* FreeBSD has relative offset if C partition offset is zero */ if (memcmp(flavour, "bsd\0", 4) == 0 && le32_to_cpu(l->d_partitions[2].p_offset) == 0) bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ continue; if (offset > bsd_start || offset+size < bsd_start+bsd_size) { strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); continue; } put_partition(state, state->next++, bsd_start, bsd_size); } put_dev_sector(sect); if (le16_to_cpu(l->d_npartitions) > max_partitions) { snprintf(tmp, sizeof(tmp), " (ignored %d more)", le16_to_cpu(l->d_npartitions) - max_partitions); strlcat(state->pp_buf, tmp, PAGE_SIZE); } strlcat(state->pp_buf, " >\n", PAGE_SIZE); } #endif static void parse_freebsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); #endif } static void parse_netbsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); #endif } static void parse_openbsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "openbsd", OPENBSD_MAXPARTITIONS); #endif } #define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ #define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ #define UNIXWARE_NUMSLICE 16 #define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ struct unixware_slice { __le16 s_label; /* label */ __le16 s_flags; /* permission flags */ __le32 start_sect; /* starting sector */ __le32 nr_sects; /* number of sectors in slice */ }; struct unixware_disklabel { __le32 d_type; /* drive type */ __le32 d_magic; /* the magic number */ __le32 d_version; /* version number */ char d_serial[12]; /* serial number of the device */ __le32 d_ncylinders; /* # of data cylinders per device */ __le32 d_ntracks; /* # of tracks per cylinder */ __le32 d_nsectors; /* # of data sectors per track */ __le32 d_secsize; /* # of bytes per sector */ __le32 d_part_start; /* # of first sector of this partition*/ __le32 d_unknown1[12]; /* ? */ __le32 d_alt_tbl; /* byte offset of alternate table */ __le32 d_alt_len; /* byte length of alternate table */ __le32 d_phys_cyl; /* # of physical cylinders per device */ __le32 d_phys_trk; /* # of physical tracks per cylinder */ __le32 d_phys_sec; /* # of physical sectors per track */ __le32 d_phys_bytes; /* # of physical bytes per sector */ __le32 d_unknown2; /* ? */ __le32 d_unknown3; /* ? */ __le32 d_pad[8]; /* pad */ struct unixware_vtoc { __le32 v_magic; /* the magic number */ __le32 v_version; /* version number */ char v_name[8]; /* volume name */ __le16 v_nslices; /* # of slices */ __le16 v_unknown1; /* ? */ __le32 v_reserved[10]; /* reserved */ struct unixware_slice v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ } vtoc; }; /* 408 */ /* * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ static void parse_unixware(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_UNIXWARE_DISKLABEL Sector sect; struct unixware_disklabel *l; struct unixware_slice *p; l = read_part_sector(state, offset + 29, §); if (!l) return; if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { put_dev_sector(sect); return; } { char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); } p = &l->vtoc.v_slice[1]; /* I omit the 0th slice as it is the same as whole disk. */ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { if (state->next == state->limit) break; if (p->s_label != UNIXWARE_FS_UNUSED) put_partition(state, state->next++, le32_to_cpu(p->start_sect), le32_to_cpu(p->nr_sects)); p++; } put_dev_sector(sect); strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } #define MINIX_NR_SUBPARTITIONS 4 /* * Minix 2.0.0/2.0.2 subpartition support. * Anand Krishnamurthy <anandk@wiproge.med.ge.com> * Rajeev V. Pillai <rajeevvp@yahoo.com> */ static void parse_minix(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; unsigned char *data; struct msdos_partition *p; int i; data = read_part_sector(state, offset, §); if (!data) return; p = (struct msdos_partition *)(data + 0x1be); /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or * the normal boot sector. */ if (msdos_magic_present(data + 510) && p->sys_ind == MINIX_PARTITION) { /* subpartition table present */ char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { if (state->next == state->limit) break; /* add each partition in use */ if (p->sys_ind == MINIX_PARTITION) put_partition(state, state->next++, start_sect(p), nr_sects(p)); } strlcat(state->pp_buf, " >\n", PAGE_SIZE); } put_dev_sector(sect); #endif /* CONFIG_MINIX_SUBPARTITION */ } static struct { unsigned char id; void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); } subtypes[] = { {FREEBSD_PARTITION, parse_freebsd}, {NETBSD_PARTITION, parse_netbsd}, {OPENBSD_PARTITION, parse_openbsd}, {MINIX_PARTITION, parse_minix}, {UNIXWARE_PARTITION, parse_unixware}, {SOLARIS_X86_PARTITION, parse_solaris_x86}, {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; int msdos_partition(struct parsed_partitions *state) { sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; struct fat_boot_sector *fb; int slot; u32 disksig; sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, §); if (!data) return -1; /* * Note order! (some AIX disks, e.g. unbootable kind, * have no MSDOS 55aa) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); #ifdef CONFIG_AIX_PARTITION return aix_partition(state); #else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; #endif } if (!msdos_magic_present(data + 510)) { put_dev_sector(sect); return 0; } /* * Now that the 55aa signature is present, this is probably * either the boot sector of a FAT filesystem or a DOS-type * partition table. Reject this in case the boot indicator * is not 0 or 0x80. */ p = (struct msdos_partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* * Even without a valid boot indicator value * its still possible this is valid FAT filesystem * without a partition table. */ fb = (struct fat_boot_sector *) data; if (slot == 1 && fb->reserved && fb->fats && fat_valid_media(fb->media)) { strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } else { put_dev_sector(sect); return 0; } } } #ifdef CONFIG_EFI_PARTITION p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. */ if (p->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT) { put_dev_sector(sect); return 0; } } #endif p = (struct msdos_partition *) (data + 0x1be); disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); /* * Look for partitions in two passes: * First find the primary and DOS-type extended partitions. * On the second pass look inside *BSD, Unixware and Solaris partitions. */ state->next = 5; for (slot = 1 ; slot <= 4 ; slot++, p++) { sector_t start = start_sect(p)*sector_size; sector_t size = nr_sects(p)*sector_size; if (!size) continue; if (is_extended_partition(p)) { /* * prevent someone doing mkfs or mkswap on an * extended partition, but leave room for LILO * FIXME: this uses one logical sector for > 512b * sector, although it may not be enough/proper. */ sector_t n = 2; n = min(size, max(sector_size, n)); put_partition(state, slot, start, n); strlcat(state->pp_buf, " <", PAGE_SIZE); parse_extended(state, start, size, disksig); strlcat(state->pp_buf, " >", PAGE_SIZE); continue; } put_partition(state, slot, start, size); set_info(state, slot, disksig); if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; if (p->sys_ind == DM6_PARTITION) strlcat(state->pp_buf, "[DM]", PAGE_SIZE); if (p->sys_ind == EZD_PARTITION) strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); } strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { unsigned char id = p->sys_ind; int n; if (!nr_sects(p)) continue; for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) ; if (!subtypes[n].parse) continue; subtypes[n].parse(state, start_sect(p) * sector_size, nr_sects(p) * sector_size, slot); } put_dev_sector(sect); return 1; } |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* * Linear conversion Plug-In * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz>, * Abramo Bagnara <abramo@alsa-project.org> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" /* * Basic linear conversion plugin */ struct linear_priv { int cvt_endian; /* need endian conversion? */ unsigned int src_ofs; /* byte offset in source format */ unsigned int dst_ofs; /* byte soffset in destination format */ unsigned int copy_ofs; /* byte offset in temporary u32 data */ unsigned int dst_bytes; /* byte size of destination format */ unsigned int copy_bytes; /* bytes to copy per conversion */ unsigned int flip; /* MSB flip for signeness, done after endian conv */ }; static inline void do_convert(struct linear_priv *data, unsigned char *dst, unsigned char *src) { unsigned int tmp = 0; unsigned char *p = (unsigned char *)&tmp; memcpy(p + data->copy_ofs, src + data->src_ofs, data->copy_bytes); if (data->cvt_endian) tmp = swab32(tmp); tmp ^= data->flip; memcpy(dst, p + data->dst_ofs, data->dst_bytes); } static void convert(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct linear_priv *data = (struct linear_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { do_convert(data, dst, src); src += src_step; dst += dst_step; } } } static snd_pcm_sframes_t linear_transfer(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; #ifdef CONFIG_SND_DEBUG { unsigned int channel; for (channel = 0; channel < plugin->src_format.channels; channel++) { if (snd_BUG_ON(src_channels[channel].area.first % 8 || src_channels[channel].area.step % 8)) return -ENXIO; if (snd_BUG_ON(dst_channels[channel].area.first % 8 || dst_channels[channel].area.step % 8)) return -ENXIO; } } #endif if (frames > dst_channels[0].frames) frames = dst_channels[0].frames; convert(plugin, src_channels, dst_channels, frames); return frames; } static void init_data(struct linear_priv *data, snd_pcm_format_t src_format, snd_pcm_format_t dst_format) { int src_le, dst_le, src_bytes, dst_bytes; src_bytes = snd_pcm_format_width(src_format) / 8; dst_bytes = snd_pcm_format_width(dst_format) / 8; src_le = snd_pcm_format_little_endian(src_format) > 0; dst_le = snd_pcm_format_little_endian(dst_format) > 0; data->dst_bytes = dst_bytes; data->cvt_endian = src_le != dst_le; data->copy_bytes = src_bytes < dst_bytes ? src_bytes : dst_bytes; if (src_le) { data->copy_ofs = 4 - data->copy_bytes; data->src_ofs = src_bytes - data->copy_bytes; } else data->src_ofs = snd_pcm_format_physical_width(src_format) / 8 - src_bytes; if (dst_le) data->dst_ofs = 4 - data->dst_bytes; else data->dst_ofs = snd_pcm_format_physical_width(dst_format) / 8 - dst_bytes; if (snd_pcm_format_signed(src_format) != snd_pcm_format_signed(dst_format)) { if (dst_le) data->flip = (__force u32)cpu_to_le32(0x80000000); else data->flip = (__force u32)cpu_to_be32(0x80000000); } } int snd_pcm_plugin_build_linear(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { int err; struct linear_priv *data; struct snd_pcm_plugin *plugin; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->rate != dst_format->rate)) return -ENXIO; if (snd_BUG_ON(src_format->channels != dst_format->channels)) return -ENXIO; if (snd_BUG_ON(!snd_pcm_format_linear(src_format->format) || !snd_pcm_format_linear(dst_format->format))) return -ENXIO; err = snd_pcm_plugin_build(plug, "linear format conversion", src_format, dst_format, sizeof(struct linear_priv), &plugin); if (err < 0) return err; data = (struct linear_priv *)plugin->extra_data; init_data(data, src_format->format, dst_format->format); plugin->transfer = linear_transfer; *r_plugin = plugin; return 0; } |
| 1 1 1 10 4 312 312 313 310 255 253 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * vma.h * * Core VMA manipulation API implemented in vma.c. */ #ifndef __MM_VMA_H #define __MM_VMA_H /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; }; struct unlink_vma_file_batch { int count; struct vm_area_struct *vmas[8]; }; /* * vma munmap operation */ struct vma_munmap_struct { struct vma_iterator *vmi; struct vm_area_struct *vma; /* The first vma to munmap */ struct vm_area_struct *prev; /* vma before the munmap area */ struct vm_area_struct *next; /* vma after the munmap area */ struct list_head *uf; /* Userfaultfd list_head */ unsigned long start; /* Aligned start addr (inclusive) */ unsigned long end; /* Aligned end addr (exclusive) */ unsigned long unmap_start; /* Unmap PTE start */ unsigned long unmap_end; /* Unmap PTE end */ int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */ /* 1 byte hole */ unsigned long nr_pages; /* Number of pages being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; }; enum vma_merge_state { VMA_MERGE_START, VMA_MERGE_ERROR_NOMEM, VMA_MERGE_NOMERGE, VMA_MERGE_SUCCESS, }; /* Represents a VMA merge operation. */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; pgoff_t pgoff; struct vm_area_struct *prev; struct vm_area_struct *next; /* Modified by vma_merge(). */ struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ unsigned long start; unsigned long end; unsigned long flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; enum vma_merge_state state; }; static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; } /* Assumes addr >= vma->vm_start. */ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) { return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } #define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ .flags = flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ struct vma_merge_struct name = { \ .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ .next = NULL, \ .vma = vma_, \ .start = start_, \ .end = end_, \ .flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else #define validate_mm(mm) do { } while (0) #endif /* Required for expand_downwards(). */ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); /* Required for expand_downwards(). */ void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); int vma_expand(struct vma_merge_struct *vmg); int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } #ifdef CONFIG_MMU /* * init_vma_munmap() - Initializer wrapper for vma_munmap_struct * @vms: The vma munmap struct * @vmi: The vma iterator * @vma: The first vm_area_struct to munmap * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head * @unlock: Unlock after the operation. Only unlocked on success */ static inline void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { vms->vmi = vmi; vms->vma = vma; if (vma) { vms->start = start; vms->end = end; } else { vms->start = vms->end = 0; } vms->unlock = unlock; vms->uf = uf; vms->vma_count = 0; vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; vms->clear_ptes = false; vms->closed_vm_ops = false; } #endif int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach); void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach); void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach); /* * reattach_vmas() - Undo any munmap work and free resources * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ static inline void reattach_vmas(struct ma_state *mas_detach) { struct vm_area_struct *vma; mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_mark_detached(vma, false); __mt_destroy(mas_detach->tree); } /* * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() * operation. * @vms: The vma unmap structure * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas, free up the maple tree used to track the vmas. * If that's not possible because the ptes are cleared (and vm_ops->closed() may * have been called), then a NULL is written over the vmas and the vmas are * removed (munmap() completed). */ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct ma_state *mas = &vms->vmi->mas; if (!vms->nr_pages) return; if (vms->clear_ptes) return reattach_vmas(mas_detach); /* * Aborting cannot just call the vm_ops open() because they are often * not symmetrical and state data has been lost. Resort to the old * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) { pr_warn_once("%s: (%d) Unable to abort munmap() operation\n", current->comm, current->pid); /* Leaving vmas detached and in-tree may hamper recovery */ reattach_vmas(mas_detach); } else { /* Clean up the insertion of the unfortunate gap */ vms_complete_munmap_vmas(vms, mas_detach); } } int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); void unlink_file_vma(struct vm_area_struct *vma); void vma_link_file(struct vm_area_struct *vma); int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); bool vma_needs_dirty_tracking(struct vm_area_struct *vma); bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* * We want to check manually if we can change individual PTEs writable * if we can't do that automatically for all PTEs in a mapping. For * private mappings, that's always the case when we have write * permissions as we properly have to handle COW. */ if (vma->vm_flags & VM_SHARED) return vma_wants_writenotify(vma, vma->vm_page_prot); return !!(vma->vm_flags & VM_WRITE); } #ifdef CONFIG_MMU static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } #endif static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev(&vmi->mas, min); } /* * These three helpers classifies VMAs for virtual memory accounting. */ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. */ static inline bool is_stack_mapping(vm_flags_t flags) { return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { __mas_set_range(&vmi->mas, index, last - 1); } static inline void vma_iter_reset(struct vma_iterator *vmi) { mas_reset(&vmi->mas); } static inline struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev_range(&vmi->mas, min); } static inline struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) { return mas_next_range(&vmi->mas, max); } static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area(&vmi->mas, min, max - 1, size); } static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area_rev(&vmi->mas, min, max - 1, size); } /* * VMA Iterator functions shared between nommu and mmap */ static inline int vma_iter_prealloc(struct vma_iterator *vmi, struct vm_area_struct *vma) { return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi) { mas_store_prealloc(&vmi->mas, NULL); } static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); } /* Store a VMA with preallocated memory */ static inline void vma_iter_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } #endif if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, unsigned long count) { return mas_expected_entries(&vmi->mas, count); } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } /* * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or * if no previous VMA, to index 0. */ static inline struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, struct vm_area_struct **pprev) { struct vm_area_struct *next = vma_next(vmi); struct vm_area_struct *prev = vma_prev(vmi); /* * Consider the case where no previous VMA exists. We advance to the * next VMA, skipping any gap, then rewind to the start of the range. * * If we were to unconditionally advance to the next range we'd wind up * at the next VMA again, so we check to ensure there is a previous VMA * to skip over. */ if (prev) vma_iter_next_range(vmi); if (pprev) *pprev = prev; return next; } #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } /* * check if a vma is sealed for modification. * return true, if modification is allowed. */ static inline bool can_modify_vma(struct vm_area_struct *vma) { if (unlikely(vma_is_sealed(vma))) return false; return true; } bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); #else static inline bool can_modify_vma(struct vm_area_struct *vma) { return true; } static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { return true; } #endif #endif /* __MM_VMA_H */ |
| 12 12 7 6 6 6 3 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 LG Electronics * Chan Jeong <chan.jeong@lge.com> * * lzo_wrapper.c */ #include <linux/mutex.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/lzo.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" struct squashfs_lzo { void *input; void *output; }; static void *lzo_init(struct squashfs_sb_info *msblk, void *buff) { int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) goto failed; stream->input = vmalloc(block_size); if (stream->input == NULL) goto failed; stream->output = vmalloc(block_size); if (stream->output == NULL) goto failed2; return stream; failed2: vfree(stream->input); failed: ERROR("Failed to allocate lzo workspace\n"); kfree(stream); return ERR_PTR(-ENOMEM); } static void lzo_free(void *strm) { struct squashfs_lzo *stream = strm; if (stream) { vfree(stream->input); vfree(stream->output); } kfree(stream); } static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = stream->input, *data; int bytes = length, res; size_t out_len = output->length; while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; } res = lzo1x_decompress_safe(stream->input, (size_t)length, stream->output, &out_len); if (res != LZO_E_OK) goto failed; res = bytes = (int)out_len; data = squashfs_first_page(output); buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { if (!IS_ERR(data)) memcpy(data, buff, bytes); break; } else { if (!IS_ERR(data)) memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); } } squashfs_finish_page(output); return res; failed: return -EIO; } const struct squashfs_decompressor squashfs_lzo_comp_ops = { .init = lzo_init, .free = lzo_free, .decompress = lzo_uncompress, .id = LZO_COMPRESSION, .name = "lzo", .alloc_buffer = 0, .supported = 1 }; |
| 6 102 41 56 58 99 20 10 1 3 18 2 46 43 3 3 39 14 11 3 8 29 5 1 4 2 12 1 2 1 2 3 3 8 3 2 41 2 241 20 2 211 222 224 223 3006 6 175 6 2 2820 7 3027 325 326 13 311 2 2 233 1 232 232 52 192 191 523 2 2 197 330 117 406 892 895 276 602 11 1 10 6 4 10 6 5 5 5 1121 2 2 2 1124 229 897 469 635 1053 430 37 19 416 53 370 426 431 954 79 85 864 402 529 897 958 1 6 1 97 103 103 1 44 1 186 192 192 649 648 649 648 14 614 3 91 179 186 204 30 30 30 10 10 1 9 10 280 1 1 5 13 1 74 188 216 213 735 1 3 1 2 1 728 536 191 232 378 707 64 8 15 49 33 30 64 44 20 217 17 62 156 159 55 219 145 74 1 11 2 219 130 88 1 73 2 500 232 305 65 221 181 181 45 45 43 43 478 477 363 24 1 336 26 4 8 1 335 1 24 2 15 111 206 205 295 304 365 338 6 5 111 1 2 110 36 71 120 120 3 1 8 102 2 43 83 101 66 9 86 86 124 9 3 8 115 77 45 2 31 22 115 115 116 19 4 1486 12 1325 164 1473 1475 1485 17 1471 70 4 1 3 2 68 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** * vfs_setpos_cookie - update the file offset for lseek and reset cookie * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * @cookie: cookie to reset * * Update the file offset to the value specified by @offset if the given * offset is valid and it is not equal to the current file offset and * reset the specified cookie to indicate that a seek happened. * * Return the specified offset on success and -EINVAL on invalid offset. */ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, loff_t maxsize, u64 *cookie) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; if (cookie) *cookie = 0; } return offset; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); /** * must_set_pos - check whether f_pos has to be updated * @file: file to seek on * @offset: offset to use * @whence: type of seek operation * @eof: end of file * * Check whether f_pos needs to be updated and update @offset according * to @whence. * * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure. */ static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: *offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (*offset == 0) { *offset = file->f_pos; return 0; } break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)*offset >= eof) return -ENXIO; *offset = eof; break; } return 1; } /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { int ret; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; if (whence == SEEK_CUR) { /* * f_lock protects against read/modify/write race with * other SEEK_CURs. Note that parallel writes and reads * behave like SEEK_SET. */ guard(spinlock)(&file->f_lock); return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_llseek_cookie - versioned llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @cookie: cookie to update * * See generic_file_llseek for a general description and locking assumptions. * * In contrast to generic_file_llseek, this function also resets a * specified cookie to indicate a seek took place. */ loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie) { struct inode *inode = file->f_mapping->host; loff_t maxsize = inode->i_sb->s_maxbytes; loff_t eof = i_size_read(inode); int ret; if (WARN_ON_ONCE(!cookie)) return -EINVAL; /* * Require that this is only used for directories that guarantee * synchronization between readdir and seek so that an update to * @cookie is correctly synchronized with concurrent readdir. */ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) return -EINVAL; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; /* No need to hold f_lock because we know that f_pos_lock is held. */ if (whence == SEEK_CUR) return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); return vfs_setpos_cookie(file, offset, maxsize, cookie); } EXPORT_SYMBOL(generic_llseek_cookie); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) file->f_pos = offset; retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); if (!fd_file(f)) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(fd_file(f), offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } fdput_pos(f); return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; struct fd f = fdget_pos(fd); loff_t offset; if (!fd_file(f)) return -EBADF; retval = -EINVAL; if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } out_putf: fdput_pos(f); return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (fd_file(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (fd_file(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (fd_file(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_read(fd_file(f), buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (fd_file(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_write(fd_file(f), buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. */ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (fd_file(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; fdput_pos(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (fd_file(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; fdput_pos(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (fd_file(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (fd_file(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ retval = -EBADF; in = fdget(in_fd); if (!fd_file(in)) goto out; if (!(fd_file(in)->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) goto fput_in; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) goto fput_in; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ retval = -EBADF; out = fdget(out_fd); if (!fd_file(out)) goto fput_in; if (!(fd_file(out)->f_mode & FMODE_WRITE)) goto fput_out; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (fd_file(in)->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(fd_file(out), true); if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) goto fput_out; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { if (fd_file(out)->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(fd_file(in)); fsnotify_modify(fd_file(out)); fd_file(out)->f_pos = out_pos; if (ppos) *ppos = pos; else fd_file(in)->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; fput_out: fdput(out); fput_in: fdput(in); out: return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; struct fd f_in; struct fd f_out; ssize_t ret = -EBADF; f_in = fdget(fd_in); if (!fd_file(f_in)) goto out2; f_out = fdget(fd_out); if (!fd_file(f_out)) goto out1; ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) goto out; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) goto out; } else { pos_out = fd_file(f_out)->f_pos; } ret = -EINVAL; if (flags != 0) goto out; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_in)->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_out)->f_pos = pos_out; } } out: fdput(f_out); out1: fdput(f_in); out2: return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) return false; if (!is_power_of_2(len)) return false; if (!IS_ALIGNED(pos, len)) return false; return true; } |
| 189 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ocfs2_lockid.h * * Defines OCFS2 lockid bits. * * Copyright (C) 2002, 2005 Oracle. All rights reserved. */ #ifndef OCFS2_LOCKID_H #define OCFS2_LOCKID_H /* lock ids are made up in the following manner: * name[0] --> type * name[1-6] --> 6 pad characters, reserved for now * name[7-22] --> block number, expressed in hex as 16 chars * name[23-30] --> i_generation, expressed in hex 8 chars * name[31] --> '\0' */ #define OCFS2_LOCK_ID_MAX_LEN 32 #define OCFS2_LOCK_ID_PAD "000000" #define OCFS2_DENTRY_LOCK_INO_START 18 enum ocfs2_lock_type { OCFS2_LOCK_TYPE_META = 0, OCFS2_LOCK_TYPE_DATA, OCFS2_LOCK_TYPE_SUPER, OCFS2_LOCK_TYPE_RENAME, OCFS2_LOCK_TYPE_RW, OCFS2_LOCK_TYPE_DENTRY, OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, OCFS2_LOCK_TYPE_QINFO, OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_LOCK_TYPE_REFCOUNT, OCFS2_LOCK_TYPE_TRIM_FS, OCFS2_NUM_LOCK_TYPES }; static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) { char c; switch (type) { case OCFS2_LOCK_TYPE_META: c = 'M'; break; case OCFS2_LOCK_TYPE_DATA: c = 'D'; break; case OCFS2_LOCK_TYPE_SUPER: c = 'S'; break; case OCFS2_LOCK_TYPE_RENAME: c = 'R'; break; case OCFS2_LOCK_TYPE_RW: c = 'W'; break; case OCFS2_LOCK_TYPE_DENTRY: c = 'N'; break; case OCFS2_LOCK_TYPE_OPEN: c = 'O'; break; case OCFS2_LOCK_TYPE_FLOCK: c = 'F'; break; case OCFS2_LOCK_TYPE_QINFO: c = 'Q'; break; case OCFS2_LOCK_TYPE_NFS_SYNC: c = 'Y'; break; case OCFS2_LOCK_TYPE_ORPHAN_SCAN: c = 'P'; break; case OCFS2_LOCK_TYPE_REFCOUNT: c = 'T'; break; case OCFS2_LOCK_TYPE_TRIM_FS: c = 'I'; break; default: c = '\0'; } return c; } static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_META] = "Meta", [OCFS2_LOCK_TYPE_DATA] = "Data", [OCFS2_LOCK_TYPE_SUPER] = "Super", [OCFS2_LOCK_TYPE_RENAME] = "Rename", /* Need to differntiate from [R]ename.. serializing writes is the * important job it does, anyway. */ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) { #ifdef __KERNEL__ BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); #endif return ocfs2_lock_type_strings[type]; } #endif /* OCFS2_LOCKID_H */ |
| 407 197 329 516 202 1 3 197 219 1 43 201 212 219 115 1 114 1 12 113 116 114 538 2 1 407 133 126 3 528 2 523 1 1 3 516 214 346 502 43 396 406 399 1 9 11 1 10 2 8 1 8 1 8 10 4 7 1218 42 1177 4 7 3 1 1 4 6 10 8 3 66 66 66 11 11 2 11 11 5 55 33 33 25 25 137 132 155 157 157 81 76 155 147 116 1 25 1 107 103 2 33 80 2 50 50 5 51 49 51 1 1 1347 57 3963 40 1951 2214 1348 3930 3928 9 2 3925 3546 2620 1716 3818 40 3824 34 3252 866 3860 1929 2456 1123 1125 1122 214 213 4 12 3939 1139 3856 410 406 4 3774 18 1175 2790 4010 1439 2787 2 3971 43 3996 18 2822 1332 74 4002 3 3 3 3 34 34 1 33 3981 2 7 1627 3531 3766 510 510 3337 3337 286 1423 1408 8 7 1412 52 1375 9 1365 1304 1304 19 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include <linux/filelock.h> #include "internal.h" int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; long error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; error = mnt_want_write(path->mnt); if (error) goto out; idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; } EXPORT_SYMBOL_GPL(vfs_truncate); long do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif long do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; int error; /* explicitly opened as large or we are on 64-bit box */ if (file->f_flags & O_LARGEFILE) small = 0; dentry = file->f_path.dentry; inode = dentry->d_inode; if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) return -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) return -EINVAL; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(file))) return -EPERM; sb_start_write(inode->i_sb); error = security_file_truncate(file); if (!error) error = do_truncate(file_mnt_idmap(file), dentry, length, ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); return error; } long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct fd f; int error; if (length < 0) return -EINVAL; f = fdget(fd); if (!fd_file(f)) return -EBADF; error = do_ftruncate(fd_file(f), length, small); fdput(f); return error; } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, compat_arg_u64_dual(length)) { return ksys_truncate(pathname, compat_arg_u64_glue(length)); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, compat_arg_u64_dual(length)) { return ksys_ftruncate(fd, compat_arg_u64_glue(length)); } #endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* * Modes are exclusive, even if that is not obvious from the encoding * as bit masks and the mix with the flag in the same namespace. * * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is * encoded as no bit set. */ switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_ALLOCATE_RANGE: case FALLOC_FL_UNSHARE_RANGE: case FALLOC_FL_ZERO_RANGE: break; case FALLOC_FL_PUNCH_HOLE: if (!(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; default: return -EOPNOTSUPP; } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wraparound */ if (check_add_overflow(offset, len, &sum)) return -EFBIG; if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { struct fd f = fdget(fd); int error = -EBADF; if (fd_file(f)) { error = vfs_fallocate(fd_file(f), mode, offset, len); fdput(f); } return error; } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), compat_arg_u64_dual(len)) { return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), compat_arg_u64_glue(len)); } #endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. * * Creating new credentials is expensive, so we try to skip doing it, * which we can if the result would match what we already got. */ static bool access_need_override_creds(int flags) { const struct cred *cred; if (flags & AT_EACCESS) return false; cred = current_cred(); if (!uid_eq(cred->fsuid, cred->uid) || !gid_eq(cred->fsgid, cred->gid)) return true; if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { if (!cap_isclear(cred->cap_effective)) return true; } else { if (!cap_isidentical(cred->cap_effective, cred->cap_permitted)) return true; } } return false; } static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; /* * XXX access_need_override_creds performs checks in hopes of skipping * this work. Make sure it stays in sync if making any changes in this * routine. */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-racy to avoid RCU * freeing. */ override_cred->non_rcu = 1; old_cred = override_creds(override_cred); /* override_cred() gets its own ref */ put_cred(override_cred); return old_cred; } static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. */ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) revert_creds(old_cred); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; set_fs_pwd(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); int error; error = -EBADF; if (!fd_file(f)) goto out; error = -ENOTDIR; if (!d_can_lookup(fd_file(f)->f_path.dentry)) goto out_putf; error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &fd_file(f)->f_path); out_putf: fdput(f); out: return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) goto dput_and_out; set_fs_root(current->fs, &path); error = 0; dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { struct fd f = fdget(fd); int err = -EBADF; if (fd_file(f)) { err = vfs_fchmod(fd_file(f), mode); fdput(f); } return err; } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, unsigned int flags) { struct path path; int error; unsigned int lookup_flags; if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) return -EINVAL; lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, flags) { return do_fchmodat(dfd, filename, mode, flags); } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * * Return: true if @kuid is valid, false if not. */ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { if (!uid_valid(kuid)) return false; attr->ia_valid |= ATTR_UID; attr->ia_vfsuid = VFSUIDT_INIT(kuid); return true; } /* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * * Return: true if @kgid is valid, false if not. */ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { if (!gid_valid(kgid)) return false; attr->ia_valid |= ATTR_GID; attr->ia_vfsgid = VFSGIDT_INIT(kgid); return true; } int chown_common(const struct path *path, uid_t user, gid_t group) { struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: newattrs.ia_vfsuid = INVALID_VFSUID; newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error = -EINVAL; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); if (error) goto out_release; error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { struct fd f = fdget(fd); int error = -EBADF; if (fd_file(f)) { error = vfs_fchown(fd_file(f), user, group); fdput(f); } return error; } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static inline int file_get_write_access(struct file *f) { int error; error = get_write_access(f->f_inode); if (unlikely(error)) return error; error = mnt_get_write_access(f->f_path.mnt); if (unlikely(error)) goto cleanup_inode; if (unlikely(f->f_mode & FMODE_BACKING)) { error = mnt_get_write_access(backing_file_user_path(f)->mnt); if (unlikely(error)) goto cleanup_mnt; } return 0; cleanup_mnt: mnt_put_write_access(f->f_path.mnt); cleanup_inode: put_write_access(f->f_inode); return error; } static int do_dentry_open(struct file *f, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Depends on full fence from get_write_access() to synchronize * against collapse_file() regarding i_writecount and nr_thps * updates. Ensures subsequent insertion of THPs into the page * cache will fail. */ if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry or NULL (as returned from ->lookup()) * * This can be used to set the result of a successful lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns "0" which must be the return value of ->atomic_open() after having * called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { file->f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized */ int vfs_open(const struct path *path, struct file *file) { int ret; file->f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* * Once we return a file with FMODE_OPENED, __fput() will call * fsnotify_close(), so we need fsnotify_open() here for * symmetry. */ fsnotify_open(file); } return ret; } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); /** * dentry_create - Create and open a file * @path: path to create * @flags: O_ flags * @mode: mode bits for new file * @cred: credentials to use * * Caller must hold the parent directory's lock, and have prepared * a negative dentry, placed in @path->dentry, for the new file. * * Caller sets @path->mnt to the vfsmount of the filesystem where * the new file is to be created. The parent directory and the * negative dentry must reside on the same filesystem instance. * * On success, returns a "struct file *". Otherwise a ERR_PTR * is returned. */ struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) error = vfs_open(path, f); if (unlikely(error)) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL(dentry_create); /** * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted * against nr_files and must not be installed into the file descriptor * table. * * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file_noaccount(flags, cred); if (IS_ERR(f)) return f; f->f_path = *path; error = do_dentry_open(f, NULL); if (error) { fput(f); return ERR_PTR(error); } fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. */ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that either shouldn't be set by userspace like * FMODE_NONOTIFY or that aren't relevant in determining struct * open_flags like O_CLOEXEC. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. */ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * Block bugs where O_DIRECTORY | O_CREAT created regular files. * Note, that blocking O_DIRECTORY | O_CREAT here also protects * O_TMPFILE below which requires O_DIRECTORY being raised. */ if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return -EINVAL; /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { /* * In order to ensure programs get explicit errors when trying * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY * is raised alongside __O_TMPFILE. */ if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_filp_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); } return file; } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static long do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; int fd = build_open_flags(how, &op); struct filename *tmp; if (fd) return fd; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fd_install(fd, f); } } putname(tmp); return fd; } long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; audit_openat2_how(&tmp); /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } return retval; } int filp_close(struct file *filp, fl_owner_t id) { int retval; retval = filp_flush(filp, id); fput(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval; struct file *file; file = file_close_fd(fd); if (!file) return -EBADF; retval = filp_flush(file, current->files); /* * We're returning to user space. Don't bother * with any delayed fput() cases. */ __fput_sync(file); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)) retval = -EINTR; return retval; } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: reserved for future extensions * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { return __close_range(fd, max_fd, flags); } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open); |
| 4 4 1 1 1 1 2 1 1 2 1 1 2 1 1 3 3 3 3 3 3 3 3 3 3 3 3 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS An implementation of the IP virtual server support for the * LINUX operating system. IPVS is now implemented as a module * over the NetFilter framework. IPVS can be used to build a * high-performance and highly available server based on a * cluster of servers. * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> #include <net/net_namespace.h> #include <linux/nsproxy.h> #include <net/ip.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif #include <net/route.h> #include <net/sock.h> #include <net/genetlink.h> #include <linux/uaccess.h> #include <net/ip_vs.h> MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; int ip_vs_get_debug_level(void) { return sysctl_ip_vs_debug_level; } #endif /* Protos */ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ static bool __ip_vs_addr_is_local_v6(struct net *net, const struct in6_addr *addr) { struct flowi6 fl6 = { .daddr = *addr, }; struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); bool is_local; is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); dst_release(dst); return is_local; } #endif #ifdef CONFIG_SYSCTL /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs */ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; int amemthresh; int nomem; int to_change = -1; /* we only count free and buffered memory (in pages) */ si_meminfo(&i); availmem = i.freeram + i.bufferram; /* however in linux 2.5 the i.bufferram is total page cache size, we need adjust it */ /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); nomem = (availmem < amemthresh); local_bh_disable(); /* drop_entry */ spin_lock(&ipvs->dropentry_lock); switch (ipvs->sysctl_drop_entry) { case 0: atomic_set(&ipvs->dropentry, 0); break; case 1: if (nomem) { atomic_set(&ipvs->dropentry, 1); ipvs->sysctl_drop_entry = 2; } else { atomic_set(&ipvs->dropentry, 0); } break; case 2: if (nomem) { atomic_set(&ipvs->dropentry, 1); } else { atomic_set(&ipvs->dropentry, 0); ipvs->sysctl_drop_entry = 1; } break; case 3: atomic_set(&ipvs->dropentry, 1); break; } spin_unlock(&ipvs->dropentry_lock); /* drop_packet */ spin_lock(&ipvs->droppacket_lock); switch (ipvs->sysctl_drop_packet) { case 0: ipvs->drop_rate = 0; break; case 1: if (nomem) { ipvs->drop_counter = amemthresh / (amemthresh - availmem); ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; } break; case 2: if (nomem) { ipvs->drop_counter = amemthresh / (amemthresh - availmem); ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; } break; case 3: ipvs->drop_rate = ipvs->sysctl_am_droprate; break; } spin_unlock(&ipvs->droppacket_lock); /* secure_tcp */ spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: if (ipvs->old_secure_tcp < 2) to_change = 1; break; } ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); spin_unlock(&ipvs->securetcp_lock); local_bh_enable(); } /* Handler for delayed work for expiring no * destination connections */ static void expire_nodest_conn_handler(struct work_struct *work) { struct netns_ipvs *ipvs; ipvs = container_of(work, struct netns_ipvs, expire_nodest_conn_work.work); ip_vs_expire_nodest_conn_flush(ipvs); } /* * Timer for checking the defense */ #define DEFENSE_TIMER_PERIOD 1*HZ static void defense_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, defense_work.work); update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); } #endif static void est_reload_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, est_reload_work.work); int genid_done = atomic_read(&ipvs->est_genid_done); unsigned long delay = HZ / 10; /* repeat startups after failure */ bool repeat = false; int genid; int id; mutex_lock(&ipvs->est_mutex); genid = atomic_read(&ipvs->est_genid); for (id = 0; id < ipvs->est_kt_count; id++) { struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ if (!ipvs->enable) goto unlock; if (!kd) continue; /* New config ? Stop kthread tasks */ if (genid != genid_done) ip_vs_est_kthread_stop(kd); if (!kd->task && !ip_vs_est_stopped(ipvs)) { /* Do not start kthreads above 0 in calc phase */ if ((!id || !ipvs->est_calc_phase) && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } atomic_set(&ipvs->est_genid_done, genid); if (repeat) queue_delayed_work(system_long_wq, &ipvs->est_reload_work, delay); unlock: mutex_unlock(&ipvs->est_mutex); } int ip_vs_use_count_inc(void) { return try_module_get(THIS_MODULE); } void ip_vs_use_count_dec(void) { module_put(THIS_MODULE); } /* * Hash table: for virtual service lookups */ #define IP_VS_SVC_TAB_BITS 8 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* * Returns hash value for virtual service */ static inline unsigned int ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif ahash = ntohl(addr_fold); ahash ^= ((size_t) ipvs >> 8); return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & IP_VS_SVC_TAB_MASK; } /* * Returns hash value of fwmark for virtual service lookup */ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) { return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> * or in the ip_vs_svc_fwm_table by fwmark. * Should be called with locked tables. */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); return 1; } /* * Unhashes a service from svc_table / svc_fwm_table. * Should be called with locked tables. */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* Remove it from the svc_table table */ hlist_del_rcu(&svc->s_list); } else { /* Remove it from the svc_fwm_table table */ hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); return 1; } /* * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol) && (svc->ipvs == ipvs)) { /* HIT */ return svc; } } return NULL; } /* * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af && (svc->ipvs == ipvs)) { /* HIT */ return svc; } } return NULL; } /* Find service, called under RCU lock */ struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; /* * Check the table hashed by fwmark first */ if (fwmark) { svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } /* * Check the table hashed by <protocol,addr,port> * for "full" addressed entries */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); if (!svc && protocol == IPPROTO_TCP && atomic_read(&ipvs->ftpsvc_counter) && (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); } if (svc == NULL && atomic_read(&ipvs->nullsvc_counter)) { /* * Check if the catch-all port (port zero) exists */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); } out: IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), svc ? "hit" : "not hit"); return svc; } static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); rcu_assign_pointer(dest->svc, svc); } static void ip_vs_service_free(struct ip_vs_service *svc) { ip_vs_stats_release(&svc->stats); kfree(svc); } static void ip_vs_service_rcu_free(struct rcu_head *head) { struct ip_vs_service *svc; svc = container_of(head, struct ip_vs_service, rcu_head); ip_vs_service_free(svc); } static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } /* * Returns hash value for real service */ static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; } /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; __be16 port; if (dest->in_rs_table) return; switch (IP_VS_DFWD_METHOD(dest)) { case IP_VS_CONN_F_MASQ: port = dest->port; break; case IP_VS_CONN_F_TUNNEL: switch (dest->tun_type) { case IP_VS_CONN_F_TUNNEL_TYPE_GUE: port = dest->tun_port; break; case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: case IP_VS_CONN_F_TUNNEL_TYPE_GRE: port = 0; break; default: return; } break; default: return; } /* * Hash by proto,addr,port, * which are the parameters of the real service. */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; } /* Unhash ip_vs_dest from rs_table. */ static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ if (dest->in_rs_table) { hlist_del_rcu(&dest->d_list); dest->in_rs_table = 0; } } /* Check if real service by <proto,addr,port> is present */ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } } return false; } /* Find real service record by <proto,addr,port>. * In case of multiple records with the same <proto,addr,port>, only * the first found record is returned. * * To be called under RCU lock. */ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return dest; } } return NULL; } /* Find real service record by <af,addr,tun_port>. * In case of multiple records with the same <af,addr,tun_port>, only * the first found record is returned. * * To be called under RCU lock. */ struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, const union nf_inet_addr *daddr, __be16 tun_port) { struct ip_vs_dest *dest; unsigned int hash; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, tun_port); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->tun_port == tun_port && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { /* HIT */ return dest; } } return NULL; } /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; /* * Find the destination for the given service */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == dest_af) && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && (dest->port == dport)) { /* HIT */ return dest; } } return NULL; } /* * Find destination by {daddr,dport,vaddr,protocol} * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. * Called under RCU lock, no refcnt is returned. */ struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags) { struct ip_vs_dest *dest; struct ip_vs_service *svc; __be16 port = dport; svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } void ip_vs_dest_dst_rcu_free(struct rcu_head *head) { struct ip_vs_dest_dst *dest_dst = container_of(head, struct ip_vs_dest_dst, rcu_head); dst_release(dest_dst->dst_cache); kfree(dest_dst); } /* Release dest_dst and dst_cache for dest in user context */ static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) { struct ip_vs_dest_dst *old; old = rcu_dereference_protected(dest->dest_dst, 1); if (old) { RCU_INIT_POINTER(dest->dest_dst, NULL); call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } } /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed * from the service table but are still referenced by some conn entries. * The reason to add the destination trash is when the dest is temporary * down (either by administrator or by monitor program), the dest can be * picked back from the trash, the remaining connections to the dest can * continue, and the counting information of the dest is also useful for * scheduling. */ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; struct netns_ipvs *ipvs = svc->ipvs; /* * Find the destination in trash */ spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (dest->af == dest_af && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ list_del(&dest->t_list); goto out; } } dest = NULL; out: spin_unlock_bh(&ipvs->dest_trash_lock); return dest; } static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; dest = container_of(head, struct ip_vs_dest, rcu_head); ip_vs_stats_release(&dest->stats); ip_vs_dest_put_and_free(dest); } static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); __ip_vs_svc_put(svc); call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* * Clean up all the destinations in the trash * Called by the ip_vs_control_cleanup() * * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must * be 1, so we simply release them here. */ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; del_timer_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); ip_vs_dest_free(dest); } } static void ip_vs_stats_rcu_free(struct rcu_head *head) { struct ip_vs_stats_rcu *rs = container_of(head, struct ip_vs_stats_rcu, rcu_head); ip_vs_stats_release(&rs->s); kfree(rs); } static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); IP_VS_SHOW_STATS_COUNTER(outpkts); IP_VS_SHOW_STATS_COUNTER(inbytes); IP_VS_SHOW_STATS_COUNTER(outbytes); ip_vs_read_estimator(dst, src); spin_unlock(&src->lock); } static void ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) { dst->conns = (u32)src->conns; dst->inpkts = (u32)src->inpkts; dst->outpkts = (u32)src->outpkts; dst->inbytes = src->inbytes; dst->outbytes = src->outbytes; dst->cps = (u32)src->cps; dst->inpps = (u32)src->inpps; dst->outpps = (u32)src->outpps; dst->inbps = (u32)src->inbps; dst->outbps = (u32)src->outbps; } static void ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); IP_VS_ZERO_STATS_COUNTER(outpkts); IP_VS_ZERO_STATS_COUNTER(inbytes); IP_VS_ZERO_STATS_COUNTER(outbytes); ip_vs_zero_estimator(stats); spin_unlock(&stats->lock); } /* Allocate fields after kzalloc */ int ip_vs_stats_init_alloc(struct ip_vs_stats *s) { int i; spin_lock_init(&s->lock); s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); if (!s->cpustats) return -ENOMEM; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); u64_stats_init(&cs->syncp); } return 0; } struct ip_vs_stats *ip_vs_stats_alloc(void) { struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); if (s && ip_vs_stats_init_alloc(s) >= 0) return s; kfree(s); return NULL; } void ip_vs_stats_release(struct ip_vs_stats *stats) { free_percpu(stats->cpustats); } void ip_vs_stats_free(struct ip_vs_stats *stats) { if (stats) { ip_vs_stats_release(stats); kfree(stats); } } /* * Update a destination in the given service */ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; /* We cannot modify an address and change the address family */ BUG_ON(!add && udest->af != dest->af); if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; /* keep the last_weight with latest non-0 weight */ if (add || udest->weight != 0) atomic_set(&dest->last_weight, udest->weight); /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; /* Need to rehash? */ if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_DFWD_METHOD(dest) || udest->tun_type != dest->tun_type || udest->tun_port != dest->tun_port) ip_vs_rs_unhash(dest); /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); /* Put the real service in rs_table if not present. */ ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); __ip_vs_svc_put(old_svc); } } /* set the dest status flags */ dest->flags |= IP_VS_DEST_F_AVAILABLE; if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; dest->af = udest->af; spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); if (add) { list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } /* * Create a destination for the given service */ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; unsigned int atype; int ret; #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; ret = nf_defrag_ipv6_enable(svc->ipvs->net); if (ret) return ret; } else #endif { atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); if (dest == NULL) return -ENOMEM; ret = ip_vs_stats_init_alloc(&dest->stats); if (ret < 0) goto err_alloc; ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) goto err_stats; dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); atomic_set(&dest->inactconns, 0); atomic_set(&dest->persistconns, 0); refcount_set(&dest->refcnt, 1); INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); __ip_vs_update_dest(svc, dest, udest, 1); return 0; err_stats: ip_vs_stats_release(&dest->stats); err_alloc: kfree(dest); return ret; } /* * Add a destination into an existing service */ static int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; int ret; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); return -EEXIST; } /* * Check if the dest already exists in the trash and * is from the same service */ dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), refcount_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) return ret; __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } return ret; } /* * Edit a destination in the given service */ static int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); return -ENOENT; } __ip_vs_update_dest(svc, dest, udest, 0); return 0; } /* * Delete a destination (must be already unlinked from the service) */ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, bool cleanup) { ip_vs_stop_estimator(ipvs, &dest->stats); /* * Remove it from the d-linked list with the real services. */ ip_vs_rs_unhash(dest); spin_lock_bh(&ipvs->dest_trash_lock); IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (list_empty(&ipvs->dest_trash) && !cleanup) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); /* dest lives in trash with reference */ list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. */ if (!cleanup) ip_vs_enqueue_expire_nodest_conns(ipvs); } /* * Unlink a destination from the given service */ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, int svcupd) { dest->flags &= ~IP_VS_DEST_F_AVAILABLE; /* * Remove it from the d-linked destination list. */ list_del_rcu(&dest->n_list); svc->num_dests--; if (dest->af != svc->af) svc->ipvs->mixed_address_family_dests--; if (svcupd) { struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->del_dest) sched->del_dest(svc, dest); } } /* * Delete a destination server in the given service */ static int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; __be16 dport = udest->port; /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); /* * Delete the destination */ __ip_vs_del_dest(svc->ipvs, dest, false); return 0; } static void ip_vs_dest_trash_expire(struct timer_list *t) { struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; spin_lock(&ipvs->dest_trash_lock); list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { if (refcount_read(&dest->refcnt) > 1) continue; if (dest->idle_start) { if (time_before(now, dest->idle_start + IP_VS_DEST_TRASH_PERIOD)) continue; } else { dest->idle_start = max(1UL, now); continue; } IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); list_del(&dest->t_list); ip_vs_dest_free(dest); } if (!list_empty(&ipvs->dest_trash)) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); spin_unlock(&ipvs->dest_trash_lock); } /* * Add a service into the service hash table */ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; int ret_hooks = -1; /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); ret = -ENOENT; goto out_err; } } if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out_err; } } #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out_err; } ret = nf_defrag_ipv6_enable(ipvs->net); if (ret) goto out_err; } #endif if ((u->af == AF_INET && !ipvs->num_services) || (u->af == AF_INET6 && !ipvs->num_services6)) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) goto out_err; ret_hooks = ret; } svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; goto out_err; } ret = ip_vs_stats_init_alloc(&svc->stats); if (ret < 0) goto out_err; /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); svc->af = u->af; svc->protocol = u->protocol; ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); /* Bind the scheduler */ if (sched) { ret = ip_vs_bind_scheduler(svc, sched); if (ret) goto out_err; sched = NULL; } ret = ip_vs_start_estimator(ipvs, &svc->stats); if (ret < 0) goto out_err; /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter); /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; else if (svc->af == AF_INET6) ipvs->num_services6++; /* Hash the service into the service table */ ip_vs_svc_hash(svc); *svc_p = svc; if (!ipvs->enable) { /* Now there is a service - full throttle */ ipvs->enable = 1; /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); } return 0; out_err: if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { ip_vs_unbind_scheduler(svc, sched); ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); /* decrease the module use count */ ip_vs_use_count_dec(); return ret; } /* * Edit a service and bind it with a new scheduler */ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); return -ENOENT; } } old_sched = sched; if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out; } old_pe = pe; } #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out; } } #endif old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { if (old_sched) { ip_vs_unbind_scheduler(svc, old_sched); RCU_INIT_POINTER(svc->scheduler, NULL); /* Wait all svc->sched_data users */ synchronize_rcu(); } /* Bind the new scheduler */ if (sched) { ret = ip_vs_bind_scheduler(svc, sched); if (ret) { ip_vs_scheduler_put(sched); goto out; } } } /* * Set the flags and timeout value */ svc->flags = u->flags | IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); /* check for optional methods in new pe */ new_pe_conn_out = (pe && pe->conn_out) ? true : false; old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; if (new_pe_conn_out && !old_pe_conn_out) atomic_inc(&svc->ipvs->conn_out_counter); if (old_pe_conn_out && !new_pe_conn_out) atomic_dec(&svc->ipvs->conn_out_counter); } out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! * - We are called under _bh lock */ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; if (svc->af == AF_INET) { ipvs->num_services--; if (!ipvs->num_services) ip_vs_unregister_hooks(ipvs, svc->af); } else if (svc->af == AF_INET6) { ipvs->num_services6--; if (!ipvs->num_services6) ip_vs_unregister_hooks(ipvs, svc->af); } ip_vs_stop_estimator(svc->ipvs, &svc->stats); /* Unbind scheduler */ old_sched = rcu_dereference_protected(svc->scheduler, 1); ip_vs_unbind_scheduler(svc, old_sched); ip_vs_scheduler_put(old_sched); /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); if (old_pe && old_pe->conn_out) atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); __ip_vs_del_dest(svc->ipvs, dest, cleanup); } /* * Update the virtual service counters */ if (svc->port == FTPPORT) atomic_dec(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_dec(&ipvs->nullsvc_counter); /* * Free the service if nobody refers to it */ __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); } /* * Unlink a service from list and try to delete it if its refcnt reached 0 */ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { ip_vs_unregister_conntrack(svc); /* Hold svc to avoid double release from dest_trash */ atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ ip_vs_svc_unhash(svc); __ip_vs_del_service(svc, cleanup); } /* * Delete a service from the service list */ static int ip_vs_del_service(struct ip_vs_service *svc) { if (svc == NULL) return -EEXIST; ip_vs_unlink_service(svc, false); return 0; } /* * Flush all the virtual services */ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { int idx; struct ip_vs_service *svc; struct hlist_node *n; /* * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } /* * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } return 0; } /* * Delete service by {netns} in the service table. * Called by __ip_vs_batch_cleanup() */ void ip_vs_service_nets_cleanup(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_flush(ipvs, true); } mutex_unlock(&__ip_vs_mutex); } /* Put all references for device (dst_cache) */ static inline void ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { struct ip_vs_dest_dst *dest_dst; spin_lock_bh(&dest->dst_lock); dest_dst = rcu_dereference_protected(dest->dest_dst, 1); if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } /* Netdev event receiver * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; struct ip_vs_dest *dest; unsigned int idx; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); } } } hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); } } } } spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { ip_vs_forget_dev(dest, dev); } spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); return NOTIFY_DONE; } /* * Zero counters in a service or all services */ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); return 0; } static int ip_vs_zero_all(struct netns_ipvs *ipvs) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } #ifdef CONFIG_SYSCTL static int proc_do_defense_mode(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int rc; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { if (val < 0 || val > 3) { rc = -EINVAL; } else { *valp = val; update_defense_level(ipvs); } } return rc; } static int proc_do_sync_threshold(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val[2]; int rc; struct ctl_table tmp = { .data = &val, .maxlen = table->maxlen, .mode = table->mode, }; mutex_lock(&ipvs->sync_mutex); memcpy(val, valp, sizeof(val)); rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (val[0] < 0 || val[1] < 0 || (val[0] >= val[1] && val[1])) rc = -EINVAL; else memcpy(valp, val, sizeof(val)); } mutex_unlock(&ipvs->sync_mutex); return rc; } static int proc_do_sync_ports(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; int rc; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { if (val < 1 || !is_power_of_2(val)) rc = -EINVAL; else *valp = val; } return rc; } static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, void *buffer) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; cpumask_var_t newmask; int ret; if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) return -ENOMEM; ret = cpulist_parse(buffer, newmask); if (ret) goto out; mutex_lock(&ipvs->est_mutex); if (!ipvs->est_cpulist_valid) { if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { ret = -ENOMEM; goto unlock; } ipvs->est_cpulist_valid = 1; } cpumask_and(newmask, newmask, ¤t->cpus_mask); cpumask_copy(*valp, newmask); /* est_max_threads may depend on cpulist size */ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); ipvs->est_calc_phase = 1; ip_vs_est_reload_start(ipvs); unlock: mutex_unlock(&ipvs->est_mutex); out: free_cpumask_var(newmask); return ret; } static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, void *buffer, size_t size) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; struct cpumask *mask; int ret; mutex_lock(&ipvs->est_mutex); if (ipvs->est_cpulist_valid) mask = *valp; else mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); mutex_unlock(&ipvs->est_mutex); return ret; } static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; /* Ignore both read and write(append) if *ppos not 0 */ if (*ppos || !*lenp) { *lenp = 0; return 0; } if (write) { /* proc_sys_call_handler() appends terminator */ ret = ipvs_proc_est_cpumask_set(table, buffer); if (ret >= 0) *ppos += *lenp; } else { /* proc_sys_call_handler() allocates 1 byte for terminator */ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); if (ret >= 0) { *lenp = ret; *ppos += *lenp; ret = 0; } } return ret; } static int ipvs_proc_est_nice(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int ret; struct ctl_table tmp_table = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); if (write && ret >= 0) { if (val < MIN_NICE || val > MAX_NICE) { ret = -EINVAL; } else { mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; ip_vs_est_reload_start(ipvs); } mutex_unlock(&ipvs->est_mutex); } } return ret; } static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int ret; struct ctl_table tmp_table = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); if (write && ret >= 0) { mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; ip_vs_est_reload_start(ipvs); } mutex_unlock(&ipvs->est_mutex); } return ret; } /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without * align with netns init in ip_vs_control_net_init() */ static struct ctl_table vs_vars[] = { { .procname = "amemthresh", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "am_droprate", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "drop_entry", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "drop_packet", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, #ifdef CONFIG_IP_VS_NFCT { .procname = "conntrack", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, #endif { .procname = "secure_tcp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "snat_reroute", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { .procname = "sync_version", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "sync_ports", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_sync_ports, }, { .procname = "sync_persist_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sync_qlen_max", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cache_bypass", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "expire_nodest_conn", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sloppy_tcp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sloppy_sctp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "expire_quiescent_template", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sync_threshold", .maxlen = sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), .mode = 0644, .proc_handler = proc_do_sync_threshold, }, { .procname = "sync_refresh_period", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "sync_retries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "pmtu_disc", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "backup_only", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "conn_reuse_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "schedule_icmp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ignore_tunneled", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "run_estimation", .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipvs_proc_run_estimation, }, { .procname = "est_cpulist", .maxlen = NR_CPUS, /* unused */ .mode = 0644, .proc_handler = ipvs_proc_est_cpulist, }, { .procname = "est_nice", .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipvs_proc_est_nice, }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", .data = &sysctl_ip_vs_debug_level, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; #endif #ifdef CONFIG_PROC_FS struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ struct hlist_head *table; int bucket; }; /* * Write the contents of the VS rule table to a PROCfs file. * (It is kept just for backward compatibility) */ static inline const char *ip_vs_fwd_name(unsigned int flags) { switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_LOCALNODE: return "Local"; case IP_VS_CONN_F_TUNNEL: return "Tunnel"; case IP_VS_CONN_F_DROUTE: return "Route"; default: return "Masq"; } } /* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; } } } /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], f_list) { if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; } } } return NULL; } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; ++*pos; if (v == SEQ_START_TOKEN) return ip_vs_info_array(seq,0); svc = v; iter = seq->private; if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ e = rcu_dereference(hlist_next_rcu(&svc->s_list)); if (e) return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[iter->bucket], s_list) { return svc; } } iter->table = ip_vs_svc_fwm_table; iter->bucket = -1; goto scan_fwmark; } /* next service in hashed by fwmark */ e = rcu_dereference(hlist_next_rcu(&svc->f_list)); if (e) return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[iter->bucket], f_list) return svc; } return NULL; } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip_vs_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } else { struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? sched->name : "none"; if (svc->ipvs != ipvs) return 0; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } if (svc->flags & IP_VS_SVC_F_PERSISTENT) seq_printf(seq, "persistent %d %08X\n", svc->timeout, ntohl(svc->netmask)); else seq_putc(seq, '\n'); list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, " -> [%pI6]:%04X" " %-7s %-6d %-10d %-10d\n", &dest->addr.in6, ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); else #endif seq_printf(seq, " -> %08X:%04X " "%-7s %-6d %-10d %-10d\n", ntohl(dest->addr.ip), ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); } } return 0; } static const struct seq_operations ip_vs_info_seq_ops = { .start = ip_vs_info_seq_start, .next = ip_vs_info_seq_next, .stop = ip_vs_info_seq_stop, .show = ip_vs_info_seq_show, }; static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_kstats show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, (unsigned long long)show.outpkts, (unsigned long long)show.inbytes, (unsigned long long)show.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", (unsigned long long)show.cps, (unsigned long long)show.inpps, (unsigned long long)show.outpps, (unsigned long long)show.inbps, (unsigned long long)show.outbps); return 0; } static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_kstats kstats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, "CPU Conns Packets Packets Bytes Bytes\n"); for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); unsigned int start; u64 conns, inpkts, outpkts, inbytes, outbytes; do { start = u64_stats_fetch_begin(&u->syncp); conns = u64_stats_read(&u->cnt.conns); inpkts = u64_stats_read(&u->cnt.inpkts); outpkts = u64_stats_read(&u->cnt.outpkts); inbytes = u64_stats_read(&u->cnt.inbytes); outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, (u64)outpkts, (u64)inbytes, (u64)outbytes); } ip_vs_copy_stats(&kstats, tot_stats); seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)kstats.conns, (unsigned long long)kstats.inpkts, (unsigned long long)kstats.outpkts, (unsigned long long)kstats.inbytes, (unsigned long long)kstats.outbytes); /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", kstats.cps, kstats.inpps, kstats.outpps, kstats.inbps, kstats.outbps); return 0; } #endif /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", u->tcp_timeout, u->tcp_fin_timeout, u->udp_timeout); #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { return -EINVAL; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) return -EINVAL; #endif #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } #endif return 0; } #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) struct ip_vs_svcdest_user { struct ip_vs_service_user s; struct ip_vs_dest_user d; }; static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), }; union ip_vs_set_arglen { struct ip_vs_service_user field_IP_VS_SO_SET_ADD; struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; struct ip_vs_service_user field_IP_VS_SO_SET_DEL; struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; }; #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { memset(usvc, 0, sizeof(*usvc)); usvc->af = AF_INET; usvc->protocol = usvc_compat->protocol; usvc->addr.ip = usvc_compat->addr; usvc->port = usvc_compat->port; usvc->fwmark = usvc_compat->fwmark; /* Deep copy of sched_name is not needed here */ usvc->sched_name = usvc_compat->sched_name; usvc->flags = usvc_compat->flags; usvc->timeout = usvc_compat->timeout; usvc->netmask = usvc_compat->netmask; } static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, struct ip_vs_dest_user *udest_compat) { memset(udest, 0, sizeof(*udest)); udest->addr.ip = udest_compat->addr; udest->port = udest_compat->port; udest->conn_flags = udest_compat->conn_flags; udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; udest->af = AF_INET; udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; } static int do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) { struct net *net = sock_net(sk); int ret; unsigned char arg[MAX_SET_ARGLEN]; struct ip_vs_service_user *usvc_compat; struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; struct ip_vs_dest_user *udest_compat; struct ip_vs_dest_user_kern udest; struct netns_ipvs *ipvs = net_ipvs(net); BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) return -EINVAL; if (len != set_arglen[CMDID(cmd)]) { IP_VS_DBG(1, "set_ctl: len %u != %u\n", len, set_arglen[CMDID(cmd)]); return -EINVAL; } if (copy_from_sockptr(arg, ptr, len) != 0) return -EFAULT; /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; if (cmd == IP_VS_SO_SET_STARTDAEMON) { struct ipvs_sync_daemon_cfg cfg; memset(&cfg, 0, sizeof(cfg)); ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } return ret; } mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ ret = ip_vs_flush(ipvs, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; } else if (!len) { /* No more commands with len == 0 below */ ret = -EINVAL; goto out_unlock; } usvc_compat = (struct ip_vs_service_user *)arg; udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); /* We only use the new structs internally, so copy userspace compat * structs to extended internal versions */ ip_vs_copy_usvc_compat(&usvc, usvc_compat); ip_vs_copy_udest_compat(&udest, udest_compat); if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { ret = ip_vs_zero_all(ipvs); goto out_unlock; } } if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == IP_VS_SCHEDNAME_MAXLEN) { ret = -EINVAL; goto out_unlock; } /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && usvc.protocol != IPPROTO_SCTP) { pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", usvc.protocol, &usvc.addr.ip, ntohs(usvc.port)); ret = -EFAULT; goto out_unlock; } /* Lookup the exact service by <protocol, addr, port> or fwmark */ rcu_read_lock(); if (usvc.fwmark == 0) svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; goto out_unlock; } switch (cmd) { case IP_VS_SO_SET_ADD: if (svc != NULL) ret = -EEXIST; else ret = ip_vs_add_service(ipvs, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); break; case IP_VS_SO_SET_DEL: ret = ip_vs_del_service(svc); if (!ret) goto out_unlock; break; case IP_VS_SO_SET_ZERO: ret = ip_vs_zero_service(svc); break; case IP_VS_SO_SET_ADDDEST: ret = ip_vs_add_dest(svc, &udest); break; case IP_VS_SO_SET_EDITDEST: ret = ip_vs_edit_dest(svc, &udest); break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); break; default: WARN_ON_ONCE(1); ret = -EINVAL; break; } out_unlock: mutex_unlock(&__ip_vs_mutex); return ret; } static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; struct ip_vs_kstats kstats; char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); sched_name = sched ? sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; dst->num_dests = src->num_dests; ip_vs_copy_stats(&kstats, &src->stats); ip_vs_export_stats_user(&dst->stats, &kstats); } static inline int __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { int idx, count=0; struct ip_vs_service *svc; struct ip_vs_service_entry entry; int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; goto out; } count++; } } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; goto out; } count++; } } out: return ret; } static inline int __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, get->port); rcu_read_unlock(); if (svc) { int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; struct ip_vs_kstats kstats; memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { if (count >= get->num_dests) break; /* Cannot expose heterogeneous members via sockopt * interface */ if (dest->af != svc->af) continue; entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); entry.weight = atomic_read(&dest->weight); entry.u_threshold = dest->u_threshold; entry.l_threshold = dest->l_threshold; entry.activeconns = atomic_read(&dest->activeconns); entry.inactconns = atomic_read(&dest->inactconns); entry.persistconns = atomic_read(&dest->persistconns); ip_vs_copy_stats(&kstats, &dest->stats); ip_vs_export_stats_user(&entry.stats, &kstats); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; break; } count++; } } else ret = -ESRCH; return ret; } static inline void __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif memset(u, 0, sizeof (*u)); #ifdef CONFIG_IP_VS_PROTO_TCP pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); u->udp_timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif } static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { [CMDID(IP_VS_SO_GET_VERSION)] = 64, [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), }; union ip_vs_get_arglen { char field_IP_VS_SO_GET_VERSION[64]; struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; }; #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) static int do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { unsigned char arg[MAX_GET_ARGLEN]; int ret = 0; unsigned int copylen; struct net *net = sock_net(sk); struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) return -EINVAL; copylen = get_arglen[CMDID(cmd)]; if (*len < (int) copylen) { IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); return -EINVAL; } if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; /* * Handle daemons first since it has its own locking */ if (cmd == IP_VS_SO_GET_DAEMON) { struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; mutex_unlock(&ipvs->sync_mutex); return ret; } mutex_lock(&__ip_vs_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; } *len = strlen(buf)+1; } break; case IP_VS_SO_GET_INFO: { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; info.num_services = ipvs->num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } break; case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; int size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_service_entries(ipvs, get, user); } break; case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; struct ip_vs_service *svc; union nf_inet_addr addr; entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, entry->protocol, &addr, entry->port); rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) ret = -EFAULT; } else ret = -ESRCH; } break; case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; int size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_dest_entries(ipvs, get, user); } break; case IP_VS_SO_GET_TIMEOUT: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } break; default: ret = -EINVAL; } out: mutex_unlock(&__ip_vs_mutex); return ret; } static struct nf_sockopt_ops ip_vs_sockopts = { .pf = PF_INET, .set_optmin = IP_VS_BASE_CTL, .set_optmax = IP_VS_SO_SET_MAX+1, .set = do_ip_vs_set_ctl, .get_optmin = IP_VS_BASE_CTL, .get_optmax = IP_VS_SO_GET_MAX+1, .get = do_ip_vs_get_ctl, .owner = THIS_MODULE, }; /* * Generic Netlink interface */ /* IPVS genetlink family */ static struct genl_family ip_vs_genl_family; /* Policy used for first-level command attributes */ static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN - 1 }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, .len = sizeof(union nf_inet_addr) }, [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_PENAME_MAXLEN }, [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, .len = sizeof(struct ip_vs_flags) }, [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, .len = sizeof(union nf_inet_addr) }, [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, IPVS_STATS_ATTR_PAD)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { struct ip_vs_scheduler *sched; struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; char *sched_name; nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) return -EMSGSIZE; if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) goto nla_put_failure; if (svc->fwmark) { if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) goto nla_put_failure; } else { if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) goto nla_put_failure; } sched = rcu_dereference_protected(svc->scheduler, 1); sched_name = sched ? sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; ip_vs_copy_stats(&kstats, &svc->stats); if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) goto nla_put_failure; if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_service); return 0; nla_put_failure: nla_nest_cancel(skb, nl_service); return -EMSGSIZE; } static int ip_vs_genl_dump_service(struct sk_buff *skb, struct ip_vs_service *svc, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_SERVICE); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_service(skb, svc) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_services(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; goto nla_put_failure; } } } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; goto nla_put_failure; } } } nla_put_failure: mutex_unlock(&__ip_vs_mutex); cb->args[0] = idx; return skb->len; } static bool ip_vs_is_af_valid(int af) { if (af == AF_INET) return true; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6 && ipv6_mod_enabled()) return true; #endif return false; } static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, bool full_entry, struct ip_vs_service **ret_svc) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; struct ip_vs_service *svc; /* Parse mandatory identifying service fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) return -EINVAL; nla_af = attrs[IPVS_SVC_ATTR_AF]; nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; nla_port = attrs[IPVS_SVC_ATTR_PORT]; nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) return -EINVAL; memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { usvc->protocol = IPPROTO_TCP; usvc->fwmark = nla_get_u32(nla_fwmark); } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) return -EINVAL; nla_memcpy(&flags, nla_flags, sizeof(flags)); /* prefill flags from service if it already exists */ if (svc) usvc->flags = svc->flags; /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_be32(nla_netmask); } return 0; } static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); return ret ? ERR_PTR(ret) : svc; } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { struct nlattr *nl_dest; struct ip_vs_kstats kstats; nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) return -EMSGSIZE; if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, (atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK)) || nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)) || nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, atomic_read(&dest->activeconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, atomic_read(&dest->persistconns)) || nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; ip_vs_copy_stats(&kstats, &dest->stats); if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) goto nla_put_failure; if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); return 0; nla_put_failure: nla_nest_cancel(skb, nl_dest); return -EMSGSIZE; } static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DEST); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_dest(skb, dest) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0; int start = cb->args[0]; struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); /* Try to find the service for which to dump destinations */ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) goto out_err; svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR_OR_NULL(svc)) goto out_err; /* Dump the destinations */ list_for_each_entry(dest, &svc->destinations, n_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { idx--; goto nla_put_failure; } } nla_put_failure: cb->args[0] = idx; out_err: mutex_unlock(&__ip_vs_mutex); return skb->len; } static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, struct nlattr *nla, bool full_entry) { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) return -EINVAL; nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); if (nla_addr_family) udest->af = nla_get_u16(nla_addr_family); else udest->af = 0; /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, *nla_l_thresh, *nla_tun_type, *nla_tun_port, *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; udest->conn_flags = nla_get_u32(nla_fwd) & IP_VS_CONN_F_FWD_MASK; udest->weight = nla_get_u32(nla_weight); udest->u_threshold = nla_get_u32(nla_u_thresh); udest->l_threshold = nla_get_u32(nla_l_thresh); if (nla_tun_type) udest->tun_type = nla_get_u8(nla_tun_type); if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); if (nla_tun_flags) udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); if (!nl_daemon) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; #ifdef CONFIG_IP_VS_IPV6 if (c->mcast_af == AF_INET6) { if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, &c->mcast_group.in6)) goto nla_put_failure; } else #endif if (c->mcast_af == AF_INET && nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, c->mcast_group.ip)) goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; nla_put_failure: nla_nest_cancel(skb, nl_daemon); return -EMSGSIZE; } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DAEMON); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; } if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; } nla_put_failure: mutex_unlock(&ipvs->sync_mutex); return skb->len; } static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ipvs_sync_daemon_cfg c; struct nlattr *a; int ret; memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), sizeof(c.mcast_ifn)); c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; if (a) c.sync_maxlen = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; if (a) { c.mcast_af = AF_INET; c.mcast_group.ip = nla_get_in_addr(a); if (!ipv4_is_multicast(c.mcast_group.ip)) return -EINVAL; } else { a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; if (a) { #ifdef CONFIG_IP_VS_IPV6 int addr_type; c.mcast_af = AF_INET6; c.mcast_group.in6 = nla_get_in6_addr(a); addr_type = ipv6_addr_type(&c.mcast_group.in6); if (!(addr_type & IPV6_ADDR_MULTICAST)) return -EINVAL; #else return -EAFNOSUPPORT; #endif } } a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; if (a) c.mcast_port = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; if (a) c.mcast_ttl = nla_get_u8(a); /* The synchronization protocol is incompatible with mixed family * services */ if (ipvs->mixed_address_family_dests > 0) return -EINVAL; ret = start_sync_thread(ipvs, &c, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { int ret; if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; ret = stop_sync_thread(ipvs, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) t.tcp_fin_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); return ip_vs_set_timeout(ipvs, &t); } static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { int ret = -EINVAL, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) goto out; if (cmd == IPVS_CMD_NEW_DAEMON) ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); else ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); } out: return ret; } static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) { bool need_full_svc = false, need_full_dest = false; struct ip_vs_service *svc = NULL; struct ip_vs_service_user_kern usvc; struct ip_vs_dest_user_kern udest; int ret = 0, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { ret = ip_vs_flush(ipvs, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(ipvs, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { ret = ip_vs_zero_all(ipvs); goto out; } /* All following commands require a service argument, so check if we * received a valid one. We need a full service specification when * adding / editing a service. Only identifying members otherwise. */ if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = true; ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) goto out; /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { ret = -ESRCH; goto out; } /* Destination commands require a valid destination argument. For * adding / editing a destination, we need a full destination * specification. */ if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || cmd == IPVS_CMD_DEL_DEST) { if (cmd != IPVS_CMD_DEL_DEST) need_full_dest = true; ret = ip_vs_genl_parse_dest(&udest, info->attrs[IPVS_CMD_ATTR_DEST], need_full_dest); if (ret) goto out; /* Old protocols did not allow the user to specify address * family, so we set it to zero instead. We also didn't * allow heterogeneous pools in the old code, so it's safe * to assume that this will have the same address family as * the service. */ if (udest.af == 0) udest.af = svc->af; if (!ip_vs_is_af_valid(udest.af)) { ret = -EAFNOSUPPORT; goto out; } if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services */ if (ipvs->sync_state) { ret = -EINVAL; goto out; } /* Which connection types do we support? */ switch (udest.conn_flags) { case IP_VS_CONN_F_TUNNEL: /* We are able to forward this */ break; default: ret = -EINVAL; goto out; } } } switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) ret = ip_vs_add_service(ipvs, &usvc, &svc); else ret = -EEXIST; break; case IPVS_CMD_SET_SERVICE: ret = ip_vs_edit_service(svc, &usvc); break; case IPVS_CMD_DEL_SERVICE: ret = ip_vs_del_service(svc); /* do not use svc, it can be freed */ break; case IPVS_CMD_NEW_DEST: ret = ip_vs_add_dest(svc, &udest); break; case IPVS_CMD_SET_DEST: ret = ip_vs_edit_dest(svc, &udest); break; case IPVS_CMD_DEL_DEST: ret = ip_vs_del_dest(svc, &udest); break; case IPVS_CMD_ZERO: ret = ip_vs_zero_service(svc); break; default: ret = -EINVAL; } out: mutex_unlock(&__ip_vs_mutex); return ret; } static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) reply_cmd = IPVS_CMD_NEW_SERVICE; else if (cmd == IPVS_CMD_GET_INFO) reply_cmd = IPVS_CMD_SET_INFO; else if (cmd == IPVS_CMD_GET_CONFIG) reply_cmd = IPVS_CMD_SET_CONFIG; else { pr_err("unknown Generic Netlink command\n"); return -EINVAL; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; mutex_lock(&__ip_vs_mutex); reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); if (reply == NULL) goto nla_put_failure; switch (cmd) { case IPVS_CMD_GET_SERVICE: { struct ip_vs_service *svc; svc = ip_vs_genl_find_service(ipvs, info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); goto out_err; } else if (svc) { ret = ip_vs_genl_fill_service(msg, svc); if (ret) goto nla_put_failure; } else { ret = -ESRCH; goto out_err; } break; } case IPVS_CMD_GET_CONFIG: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); #ifdef CONFIG_IP_VS_PROTO_TCP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout) || nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, t.tcp_fin_timeout)) goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE) || nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, ip_vs_conn_tab_size)) goto nla_put_failure; break; } genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); goto out; nla_put_failure: pr_err("not enough space in Netlink message\n"); ret = -EMSGSIZE; out_err: nlmsg_free(msg); out: mutex_unlock(&__ip_vs_mutex); return ret; } static const struct genl_small_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, .dumpit = ip_vs_genl_dump_services, }, { .cmd = IPVS_CMD_NEW_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_dests, }, { .cmd = IPVS_CMD_NEW_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_DEL_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_GET_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_daemons, }, { .cmd = IPVS_CMD_SET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_GET_INFO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_ZERO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, }; static struct genl_family ip_vs_genl_family __ro_after_init = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_ATTR_MAX, .policy = ip_vs_cmd_policy, .netnsok = true, /* Make ipvsadm to work on netns */ .module = THIS_MODULE, .small_ops = ip_vs_genl_ops, .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), .resv_start_op = IPVS_CMD_FLUSH + 1, }; static int __init ip_vs_genl_register(void) { return genl_register_family(&ip_vs_genl_family); } static void ip_vs_genl_unregister(void) { genl_unregister_family(&ip_vs_genl_family); } /* End of Generic Netlink interface definitions */ /* * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; struct ctl_table *tbl; int idx, ret; size_t ctl_table_size = ARRAY_SIZE(vs_vars); bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; } else tbl = vs_vars; /* Initialize sysctl defaults */ for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { if (tbl[idx].proc_handler == proc_do_defense_mode) tbl[idx].extra2 = ipvs; } idx = 0; ipvs->sysctl_amemthresh = 1024; tbl[idx++].data = &ipvs->sysctl_amemthresh; ipvs->sysctl_am_droprate = 10; tbl[idx++].data = &ipvs->sysctl_am_droprate; tbl[idx++].data = &ipvs->sysctl_drop_entry; tbl[idx++].data = &ipvs->sysctl_drop_packet; #ifdef CONFIG_IP_VS_NFCT tbl[idx++].data = &ipvs->sysctl_conntrack; #endif tbl[idx++].data = &ipvs->sysctl_secure_tcp; ipvs->sysctl_snat_reroute = 1; tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; if (unpriv) tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; ipvs->sysctl_sync_sock_size = 0; if (unpriv) tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_conn_reuse_mode = 1; tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) tbl[idx++].mode = 0444; #endif ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl, ctl_table_size); if (!ipvs->sysctl_hdr) goto err; ipvs->sysctl_tbl = tbl; ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); if (ret < 0) goto err; /* Schedule defense work */ queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); return 0; err: unregister_net_sysctl_table(ipvs->sysctl_hdr); if (!net_eq(net, &init_net)) kfree(tbl); return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); } #else static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } #endif static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, #ifdef CONFIG_IP_VS_IPV6 .priority = ADDRCONF_NOTIFY_PRIORITY + 5, #endif }; int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { int ret = -ENOMEM; int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); if (!ipvs->tot_stats) goto out; if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) goto err_vs; if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, ip_vs_stats_show, NULL)) goto err_stats; if (!proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; #endif ret = ip_vs_control_net_init_sysctl(ipvs); if (ret < 0) goto err; return 0; err: #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); err_percpu: remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); err_stats: remove_proc_entry("ip_vs", ipvs->net->proc_net); err_vs: #endif ip_vs_stats_release(&ipvs->tot_stats->s); err_tot_stats: kfree(ipvs->tot_stats); out: return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) { int ret; ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } return 0; err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } void ip_vs_unregister_nl_ioctl(void) { ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); } int __init ip_vs_control_init(void) { int idx; int ret; /* Initialize svc_table, ip_vs_svc_fwm_table */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); } smp_wmb(); /* Do we really need it now ? */ ret = register_netdevice_notifier(&ip_vs_dst_notifier); if (ret < 0) return ret; return 0; } void ip_vs_control_cleanup(void) { unregister_netdevice_notifier(&ip_vs_dst_notifier); /* relying on common rcu_barrier() in ip_vs_cleanup() */ } |
| 1879 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 | /* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif #define PERF_GUEST_ACTIVE 0x01 #define PERF_GUEST_USER 0x02 struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <linux/rhashtable-types.h> #include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/static_call.h> #include <linux/lockdep.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; __u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * hw_perf_event::flag values * * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 last_tag; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* aux / Intel-PT */ u64 aux_config; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 freq_time_stamp; u64 freq_count_stamp; #endif }; struct perf_event; struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 /** * pmu::scope */ enum perf_pmu_scope { PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, PERF_PMU_SCOPE_PKG, PERF_PMU_SCOPE_SYS_WIDE, PERF_PMU_MAX_SCOPE, }; struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; struct module *module; struct device *dev; struct device *parent; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; /* * PMU scope */ unsigned int scope; int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. * * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to 0 (see * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage * examples. */ void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc); /* optional */ /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Skip programming this PMU on the given CPU. Typically needed for * big.LITTLE things. */ bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /** * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; /* * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex * as such iteration must hold either lock. However, since ctx->lock is an IRQ * safe lock, and is only held by the CPU doing the modification, having IRQs * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group in tact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. */ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; /* * event->pmu will always point to pmu in which this event belongs. * Whereas event->pmu_ctx->pmu may point to other pmu when group of * different pmu events is created. */ struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; /* * event->pmu_ctx points to perf_event_pmu_context in which the event * is added. This pmu_ctx can be of other pmu for sw event when that * sw event is part of a group which also contains non-sw events. */ struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; struct rcuwait pending_work_wait; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; struct bpf_prog *prog; u64 bpf_cookie; #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; #endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; /* * Certain events gets forwarded to another pmu internally by over- * writing kernel copy of event->attr.type without user being aware * of it. event->orig_type contains original 'type' requested by * user. */ __u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; /* * ,-----------------------[1:n]------------------------. * V V * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event * | | * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed * (similar to perf_event_context). Locking is as if it were a member of * perf_event_context; specifically: * * modification, both: ctx->mutex && ctx->lock * reading, either: ctx->mutex || ctx->lock * * There is one exception to this; namely put_pmu_ctx() isn't always called * with ctx->mutex held; this means that as long as we can guarantee the epc * has events the above rules hold. * * Specificially, sys_perf_event_open()'s group_leader case depends on * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. * * perf_event holds a refcount on perf_event_context * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; struct perf_event_context *ctx; struct list_head pmu_ctx_entry; struct list_head pinned_active; struct list_head flexible_active; /* Used to avoid freeing per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; unsigned int nr_cgroups; unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; void *task_ctx_data; /* pmu specific data */ /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to * events not necessary to be active due to scheduling constraints, * such as cgroups. */ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ local_t nr_no_switch_fast; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. */ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. */ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { br->mispred = 0; br->predicted = 0; br->in_tx = 0; br->abort = 0; br->cycles = 0; br->type = 0; br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { perf_overflow_handler_t overflow_handler = event->overflow_handler; if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else static inline unsigned int perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } #endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. */ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } int perf_allow_kernel(struct perf_event_attr *attr); static inline int perf_allow_cpu(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags # define perf_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu->setup_aux; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* Only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS int perf_event_init_cpu(unsigned int cpu); int perf_event_exit_cpu(unsigned int cpu); #else #define perf_event_init_cpu NULL #define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. * * On software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. Therefore, static call is used to * stop the hardware recorder. */ /* * cnt is the number of entries allocated for entries. * Return number of entries copied to . */ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); #ifndef PERF_NEEDS_LOPWR_CB static inline void perf_lopwr_cb(bool mode) { } #endif #endif /* _LINUX_PERF_EVENT_H */ |
| 4 4 5 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Ports * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ #include <sound/core.h> #include <linux/slab.h> #include <linux/module.h> #include "seq_system.h" #include "seq_ports.h" #include "seq_clientmgr.h" /* registration of client ports */ /* NOTE: the current implementation of the port structure as a linked list is not optimal for clients that have many ports. For sending messages to all subscribers of a port we first need to find the address of the port structure, which means we have to traverse the list. A direct access table (array) would be better, but big preallocated arrays waste memory. Possible actions: 1) leave it this way, a client does normaly does not have more than a few ports 2) replace the linked list of ports by a array of pointers which is dynamicly kmalloced. When a port is added or deleted we can simply allocate a new array, copy the corresponding pointers, and delete the old one. We then only need a pointer to this array, and an integer that tells us how much elements are in array. */ /* return pointer to port structure - port is locked if found */ struct snd_seq_client_port *snd_seq_port_use_ptr(struct snd_seq_client *client, int num) { struct snd_seq_client_port *port; if (client == NULL) return NULL; guard(read_lock)(&client->ports_lock); list_for_each_entry(port, &client->ports_list_head, list) { if (port->addr.port == num) { if (port->closing) break; /* deleting now */ snd_use_lock_use(&port->use_lock); return port; } } return NULL; /* not found */ } /* search for the next port - port is locked if found */ struct snd_seq_client_port *snd_seq_port_query_nearest(struct snd_seq_client *client, struct snd_seq_port_info *pinfo) { int num; struct snd_seq_client_port *port, *found; bool check_inactive = (pinfo->capability & SNDRV_SEQ_PORT_CAP_INACTIVE); num = pinfo->addr.port; found = NULL; guard(read_lock)(&client->ports_lock); list_for_each_entry(port, &client->ports_list_head, list) { if ((port->capability & SNDRV_SEQ_PORT_CAP_INACTIVE) && !check_inactive) continue; /* skip inactive ports */ if (port->addr.port < num) continue; if (port->addr.port == num) { found = port; break; } if (found == NULL || port->addr.port < found->addr.port) found = port; } if (found) { if (found->closing) found = NULL; else snd_use_lock_use(&found->use_lock); } return found; } /* initialize snd_seq_port_subs_info */ static void port_subs_info_init(struct snd_seq_port_subs_info *grp) { INIT_LIST_HEAD(&grp->list_head); grp->count = 0; grp->exclusive = 0; rwlock_init(&grp->list_lock); init_rwsem(&grp->list_mutex); grp->open = NULL; grp->close = NULL; } /* create a port, port number or a negative error code is returned * the caller needs to unref the port via snd_seq_port_unlock() appropriately */ int snd_seq_create_port(struct snd_seq_client *client, int port, struct snd_seq_client_port **port_ret) { struct snd_seq_client_port *new_port, *p; int num; *port_ret = NULL; /* sanity check */ if (snd_BUG_ON(!client)) return -EINVAL; if (client->num_ports >= SNDRV_SEQ_MAX_PORTS) { pr_warn("ALSA: seq: too many ports for client %d\n", client->number); return -EINVAL; } /* create a new port */ new_port = kzalloc(sizeof(*new_port), GFP_KERNEL); if (!new_port) return -ENOMEM; /* failure, out of memory */ /* init port data */ new_port->addr.client = client->number; new_port->addr.port = -1; new_port->owner = THIS_MODULE; snd_use_lock_init(&new_port->use_lock); port_subs_info_init(&new_port->c_src); port_subs_info_init(&new_port->c_dest); snd_use_lock_use(&new_port->use_lock); num = max(port, 0); guard(mutex)(&client->ports_mutex); guard(write_lock_irq)(&client->ports_lock); list_for_each_entry(p, &client->ports_list_head, list) { if (p->addr.port == port) { kfree(new_port); return -EBUSY; } if (p->addr.port > num) break; if (port < 0) /* auto-probe mode */ num = p->addr.port + 1; } /* insert the new port */ list_add_tail(&new_port->list, &p->list); client->num_ports++; new_port->addr.port = num; /* store the port number in the port */ sprintf(new_port->name, "port-%d", num); *port_ret = new_port; return num; } /* */ static int subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack); static int unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack); static struct snd_seq_client_port *get_client_port(struct snd_seq_addr *addr, struct snd_seq_client **cp) { struct snd_seq_client_port *p; *cp = snd_seq_client_use_ptr(addr->client); if (*cp) { p = snd_seq_port_use_ptr(*cp, addr->port); if (! p) { snd_seq_client_unlock(*cp); *cp = NULL; } return p; } return NULL; } static void delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack); static inline struct snd_seq_subscribers * get_subscriber(struct list_head *p, bool is_src) { if (is_src) return list_entry(p, struct snd_seq_subscribers, src_list); else return list_entry(p, struct snd_seq_subscribers, dest_list); } /* * remove all subscribers on the list * this is called from port_delete, for each src and dest list. */ static void clear_subscriber_list(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, int is_src) { struct list_head *p, *n; list_for_each_safe(p, n, &grp->list_head) { struct snd_seq_subscribers *subs; struct snd_seq_client *c; struct snd_seq_client_port *aport; subs = get_subscriber(p, is_src); if (is_src) aport = get_client_port(&subs->info.dest, &c); else aport = get_client_port(&subs->info.sender, &c); delete_and_unsubscribe_port(client, port, subs, is_src, false); if (!aport) { /* looks like the connected port is being deleted. * we decrease the counter, and when both ports are deleted * remove the subscriber info */ if (atomic_dec_and_test(&subs->ref_count)) kfree(subs); continue; } /* ok we got the connected port */ delete_and_unsubscribe_port(c, aport, subs, !is_src, true); kfree(subs); snd_seq_port_unlock(aport); snd_seq_client_unlock(c); } } /* delete port data */ static int port_delete(struct snd_seq_client *client, struct snd_seq_client_port *port) { /* set closing flag and wait for all port access are gone */ port->closing = 1; snd_use_lock_sync(&port->use_lock); /* clear subscribers info */ clear_subscriber_list(client, port, &port->c_src, true); clear_subscriber_list(client, port, &port->c_dest, false); if (port->private_free) port->private_free(port->private_data); snd_BUG_ON(port->c_src.count != 0); snd_BUG_ON(port->c_dest.count != 0); kfree(port); return 0; } /* delete a port with the given port id */ int snd_seq_delete_port(struct snd_seq_client *client, int port) { struct snd_seq_client_port *found = NULL, *p; scoped_guard(mutex, &client->ports_mutex) { guard(write_lock_irq)(&client->ports_lock); list_for_each_entry(p, &client->ports_list_head, list) { if (p->addr.port == port) { /* ok found. delete from the list at first */ list_del(&p->list); client->num_ports--; found = p; break; } } } if (found) return port_delete(client, found); else return -ENOENT; } /* delete the all ports belonging to the given client */ int snd_seq_delete_all_ports(struct snd_seq_client *client) { struct list_head deleted_list; struct snd_seq_client_port *port, *tmp; /* move the port list to deleted_list, and * clear the port list in the client data. */ guard(mutex)(&client->ports_mutex); scoped_guard(write_lock_irq, &client->ports_lock) { if (!list_empty(&client->ports_list_head)) { list_add(&deleted_list, &client->ports_list_head); list_del_init(&client->ports_list_head); } else { INIT_LIST_HEAD(&deleted_list); } client->num_ports = 0; } /* remove each port in deleted_list */ list_for_each_entry_safe(port, tmp, &deleted_list, list) { list_del(&port->list); snd_seq_system_client_ev_port_exit(port->addr.client, port->addr.port); port_delete(client, port); } return 0; } /* set port info fields */ int snd_seq_set_port_info(struct snd_seq_client_port * port, struct snd_seq_port_info * info) { if (snd_BUG_ON(!port || !info)) return -EINVAL; /* set port name */ if (info->name[0]) strscpy(port->name, info->name, sizeof(port->name)); /* set capabilities */ port->capability = info->capability; /* get port type */ port->type = info->type; /* information about supported channels/voices */ port->midi_channels = info->midi_channels; port->midi_voices = info->midi_voices; port->synth_voices = info->synth_voices; /* timestamping */ port->timestamping = (info->flags & SNDRV_SEQ_PORT_FLG_TIMESTAMP) ? 1 : 0; port->time_real = (info->flags & SNDRV_SEQ_PORT_FLG_TIME_REAL) ? 1 : 0; port->time_queue = info->time_queue; /* UMP direction and group */ port->direction = info->direction; port->ump_group = info->ump_group; if (port->ump_group > SNDRV_UMP_MAX_GROUPS) port->ump_group = 0; /* fill default port direction */ if (!port->direction) { if (info->capability & SNDRV_SEQ_PORT_CAP_READ) port->direction |= SNDRV_SEQ_PORT_DIR_INPUT; if (info->capability & SNDRV_SEQ_PORT_CAP_WRITE) port->direction |= SNDRV_SEQ_PORT_DIR_OUTPUT; } port->is_midi1 = !!(info->flags & SNDRV_SEQ_PORT_FLG_IS_MIDI1); return 0; } /* get port info fields */ int snd_seq_get_port_info(struct snd_seq_client_port * port, struct snd_seq_port_info * info) { if (snd_BUG_ON(!port || !info)) return -EINVAL; /* get port name */ strscpy(info->name, port->name, sizeof(info->name)); /* get capabilities */ info->capability = port->capability; /* get port type */ info->type = port->type; /* information about supported channels/voices */ info->midi_channels = port->midi_channels; info->midi_voices = port->midi_voices; info->synth_voices = port->synth_voices; /* get subscriber counts */ info->read_use = port->c_src.count; info->write_use = port->c_dest.count; /* timestamping */ info->flags = 0; if (port->timestamping) { info->flags |= SNDRV_SEQ_PORT_FLG_TIMESTAMP; if (port->time_real) info->flags |= SNDRV_SEQ_PORT_FLG_TIME_REAL; info->time_queue = port->time_queue; } if (port->is_midi1) info->flags |= SNDRV_SEQ_PORT_FLG_IS_MIDI1; /* UMP direction and group */ info->direction = port->direction; info->ump_group = port->ump_group; return 0; } /* * call callback functions (if any): * the callbacks are invoked only when the first (for connection) or * the last subscription (for disconnection) is done. Second or later * subscription results in increment of counter, but no callback is * invoked. * This feature is useful if these callbacks are associated with * initialization or termination of devices (see seq_midi.c). */ static int subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack) { int err = 0; if (!try_module_get(port->owner)) return -EFAULT; grp->count++; if (grp->open && grp->count == 1) { err = grp->open(port->private_data, info); if (err < 0) { module_put(port->owner); grp->count--; } } if (err >= 0 && send_ack && client->type == USER_CLIENT) snd_seq_client_notify_subscription(port->addr.client, port->addr.port, info, SNDRV_SEQ_EVENT_PORT_SUBSCRIBED); return err; } static int unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack) { int err = 0; if (! grp->count) return -EINVAL; grp->count--; if (grp->close && grp->count == 0) err = grp->close(port->private_data, info); if (send_ack && client->type == USER_CLIENT) snd_seq_client_notify_subscription(port->addr.client, port->addr.port, info, SNDRV_SEQ_EVENT_PORT_UNSUBSCRIBED); module_put(port->owner); return err; } /* check if both addresses are identical */ static inline int addr_match(struct snd_seq_addr *r, struct snd_seq_addr *s) { return (r->client == s->client) && (r->port == s->port); } /* check the two subscribe info match */ /* if flags is zero, checks only sender and destination addresses */ static int match_subs_info(struct snd_seq_port_subscribe *r, struct snd_seq_port_subscribe *s) { if (addr_match(&r->sender, &s->sender) && addr_match(&r->dest, &s->dest)) { if (r->flags && r->flags == s->flags) return r->queue == s->queue; else if (! r->flags) return 1; } return 0; } static int check_and_subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool exclusive, bool ack) { struct snd_seq_port_subs_info *grp; struct list_head *p; struct snd_seq_subscribers *s; int err; grp = is_src ? &port->c_src : &port->c_dest; guard(rwsem_write)(&grp->list_mutex); if (exclusive) { if (!list_empty(&grp->list_head)) return -EBUSY; } else { if (grp->exclusive) return -EBUSY; /* check whether already exists */ list_for_each(p, &grp->list_head) { s = get_subscriber(p, is_src); if (match_subs_info(&subs->info, &s->info)) return -EBUSY; } } err = subscribe_port(client, port, grp, &subs->info, ack); if (err < 0) { grp->exclusive = 0; return err; } /* add to list */ guard(write_lock_irq)(&grp->list_lock); if (is_src) list_add_tail(&subs->src_list, &grp->list_head); else list_add_tail(&subs->dest_list, &grp->list_head); grp->exclusive = exclusive; atomic_inc(&subs->ref_count); return 0; } /* called with grp->list_mutex held */ static void __delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack) { struct snd_seq_port_subs_info *grp; struct list_head *list; bool empty; grp = is_src ? &port->c_src : &port->c_dest; list = is_src ? &subs->src_list : &subs->dest_list; scoped_guard(write_lock_irq, &grp->list_lock) { empty = list_empty(list); if (!empty) list_del_init(list); grp->exclusive = 0; } if (!empty) unsubscribe_port(client, port, grp, &subs->info, ack); } static void delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack) { struct snd_seq_port_subs_info *grp; grp = is_src ? &port->c_src : &port->c_dest; guard(rwsem_write)(&grp->list_mutex); __delete_and_unsubscribe_port(client, port, subs, is_src, ack); } /* connect two ports */ int snd_seq_port_connect(struct snd_seq_client *connector, struct snd_seq_client *src_client, struct snd_seq_client_port *src_port, struct snd_seq_client *dest_client, struct snd_seq_client_port *dest_port, struct snd_seq_port_subscribe *info) { struct snd_seq_subscribers *subs; bool exclusive; int err; subs = kzalloc(sizeof(*subs), GFP_KERNEL); if (!subs) return -ENOMEM; subs->info = *info; atomic_set(&subs->ref_count, 0); INIT_LIST_HEAD(&subs->src_list); INIT_LIST_HEAD(&subs->dest_list); exclusive = !!(info->flags & SNDRV_SEQ_PORT_SUBS_EXCLUSIVE); err = check_and_subscribe_port(src_client, src_port, subs, true, exclusive, connector->number != src_client->number); if (err < 0) goto error; err = check_and_subscribe_port(dest_client, dest_port, subs, false, exclusive, connector->number != dest_client->number); if (err < 0) goto error_dest; return 0; error_dest: delete_and_unsubscribe_port(src_client, src_port, subs, true, connector->number != src_client->number); error: kfree(subs); return err; } /* remove the connection */ int snd_seq_port_disconnect(struct snd_seq_client *connector, struct snd_seq_client *src_client, struct snd_seq_client_port *src_port, struct snd_seq_client *dest_client, struct snd_seq_client_port *dest_port, struct snd_seq_port_subscribe *info) { struct snd_seq_port_subs_info *dest = &dest_port->c_dest; struct snd_seq_subscribers *subs; int err = -ENOENT; /* always start from deleting the dest port for avoiding concurrent * deletions */ scoped_guard(rwsem_write, &dest->list_mutex) { /* look for the connection */ list_for_each_entry(subs, &dest->list_head, dest_list) { if (match_subs_info(info, &subs->info)) { __delete_and_unsubscribe_port(dest_client, dest_port, subs, false, connector->number != dest_client->number); err = 0; break; } } } if (err < 0) return err; delete_and_unsubscribe_port(src_client, src_port, subs, true, connector->number != src_client->number); kfree(subs); return 0; } /* get matched subscriber */ int snd_seq_port_get_subscription(struct snd_seq_port_subs_info *src_grp, struct snd_seq_addr *dest_addr, struct snd_seq_port_subscribe *subs) { struct snd_seq_subscribers *s; int err = -ENOENT; guard(rwsem_read)(&src_grp->list_mutex); list_for_each_entry(s, &src_grp->list_head, src_list) { if (addr_match(dest_addr, &s->info.dest)) { *subs = s->info; err = 0; break; } } return err; } /* * Attach a device driver that wants to receive events from the * sequencer. Returns the new port number on success. * A driver that wants to receive the events converted to midi, will * use snd_seq_midisynth_register_port(). */ /* exported */ int snd_seq_event_port_attach(int client, struct snd_seq_port_callback *pcbp, int cap, int type, int midi_channels, int midi_voices, char *portname) { struct snd_seq_port_info portinfo; int ret; /* Set up the port */ memset(&portinfo, 0, sizeof(portinfo)); portinfo.addr.client = client; strscpy(portinfo.name, portname ? portname : "Unnamed port", sizeof(portinfo.name)); portinfo.capability = cap; portinfo.type = type; portinfo.kernel = pcbp; portinfo.midi_channels = midi_channels; portinfo.midi_voices = midi_voices; /* Create it */ ret = snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_CREATE_PORT, &portinfo); if (ret >= 0) ret = portinfo.addr.port; return ret; } EXPORT_SYMBOL(snd_seq_event_port_attach); /* * Detach the driver from a port. */ /* exported */ int snd_seq_event_port_detach(int client, int port) { struct snd_seq_port_info portinfo; int err; memset(&portinfo, 0, sizeof(portinfo)); portinfo.addr.client = client; portinfo.addr.port = port; err = snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_DELETE_PORT, &portinfo); return err; } EXPORT_SYMBOL(snd_seq_event_port_detach); |
| 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* md.h : kernel internal structure of the Linux MD driver Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman */ #ifndef _MD_MD_H #define _MD_MD_H #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/badblocks.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <trace/events/block.h> #include "md-cluster.h" #define MaxSector (~(sector_t)0) /* * These flags should really be called "NO_RETRY" rather than * "FAILFAST" because they don't make any promise about time lapse, * only about the number of retries, which will be zero. * REQ_FAILFAST_DRIVER is not included because * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") * seems to suggest that the errors it avoids retrying should usually * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) /* Status of sync thread. */ enum sync_action { /* * Represent by MD_RECOVERY_SYNC, start when: * 1) after assemble, sync data from first rdev to other copies, this * must be done first before other sync actions and will only execute * once; * 2) resize the array(notice that this is not reshape), sync data for * the new range; */ ACTION_RESYNC, /* * Represent by MD_RECOVERY_RECOVER, start when: * 1) for new replacement, sync data based on the replace rdev or * available copies from other rdev; * 2) for new member disk while the array is degraded, sync data from * other rdev; * 3) reassemble after power failure or re-add a hot removed rdev, sync * data from first rdev to other copies based on bitmap; */ ACTION_RECOVER, /* * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED | * MD_RECOVERY_CHECK, start when user echo "check" to sysfs api * sync_action, used to check if data copies from differenct rdev are * the same. The number of mismatch sectors will be exported to user * by sysfs api mismatch_cnt; */ ACTION_CHECK, /* * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, start when * user echo "repair" to sysfs api sync_action, usually paired with * ACTION_CHECK, used to force syncing data once user found that there * are inconsistent data, */ ACTION_REPAIR, /* * Represent by MD_RECOVERY_RESHAPE, start when new member disk is added * to the conf, notice that this is different from spares or * replacement; */ ACTION_RESHAPE, /* * Represent by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action * or internal usage like setting the array read-only, will forbid above * actions. */ ACTION_FROZEN, /* * All above actions don't match. */ ACTION_IDLE, NR_SYNC_ACTIONS, }; /* * The struct embedded in rdev is used to serialize IO. */ struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; wait_queue_head_t serial_io_wait; }; /* * MD's 'extended' device */ struct md_rdev { struct list_head same_set; /* RAID devices within the same set */ sector_t sectors; /* Device size (in 512bytes sectors) */ struct mddev *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ /* * If meta_bdev is non-NULL, it means that a separate device is * being used to store the metadata (superblock/bitmap) which * would otherwise be contained on the same device as the data (bdev). */ struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct file *bdev_file; /* Handle from open for bdev */ struct page *sb_page, *bb_page; int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ sector_t new_data_offset;/* only relevant while reshaping */ sector_t sb_start; /* offset of the super block (in 512byte sectors) */ int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ struct kobject kobj; /* A device can be in one of three states based on two flags: * Not working: faulty==1 in_sync==0 * Fully working: faulty==0 in_sync==1 * Working, but not * in sync with array * faulty==0 in_sync==0 * * It can never have faulty==1, in_sync==1 * This reduces the burden of testing multiple flags in many cases */ unsigned long flags; /* bit set of 'enum flag_bits' bits. */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ int new_raid_disk; /* role that the device will have in * the array after a level-change completes. */ int saved_raid_disk; /* role that device used to have in the * array and could again if we did a partial * resync from the bitmap */ union { sector_t recovery_offset;/* If this device has been partially * recovered, this is where we were * up to. */ sector_t journal_tail; /* If this device is a journal device, * this is the journal tail (journal * recovery start point) */ }; atomic_t nr_pending; /* number of pending requests. * only maintained for arrays that * support hot removal */ atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ time64_t last_read_error; /* monotonic time since our * last read error */ atomic_t corrected_errors; /* number of corrected read errors, * for reporting to userspace and storing * in superblock. */ struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ struct kernfs_node *sysfs_unack_badblocks; /* handle for 'bad_blocks' sysfs dentry */ struct kernfs_node *sysfs_badblocks; struct badblocks badblocks; struct { short offset; /* Offset from superblock to start of PPL. * Not used by external metadata. */ unsigned int size; /* Size in sectors of the PPL space */ sector_t sector; /* First sector of the PPL space */ } ppl; }; enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ Bitmap_sync, /* ..actually, not quite In_sync. Need a * bitmap-based recovery to get fully in sync. * The bit is only meaningful before device * has been passed to pers->hot_add_disk. */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ Blocked, /* An error occurred but has not yet * been acknowledged by the metadata * handler, so don't allow writes * until it is cleared */ WriteErrorSeen, /* A write error has been seen on this * device */ FaultRecorded, /* Intermediate state for clearing * Blocked. The Fault is/will-be * recorded in the metadata, but that * metadata hasn't been stored safely * on disk yet. */ BlockedBadBlocks, /* A writer is blocked because they * found an unacknowledged bad-block. * This can safely be cleared at any * time, and the writer will re-check. * It may be set at any time, and at * worst the writer will timeout and * re-check. So setting it as * accurately as possible is good, but * not absolutely critical. */ WantReplacement, /* This device is a candidate to be * hot-replaced, either because it has * reported some faults, or because * of explicit request. */ Replacement, /* This device is a replacement for * a want_replacement device with same * raid_disk number. */ Candidate, /* For clustered environments only: * This device is seen locally but not * by the whole cluster */ Journal, /* This device is used as journal for * raid-5/6. * Usually, this device should be faster * than other devices in the array */ ClusterRemove, ExternalBbl, /* External metadata provides bad * block management for a disk */ FailFast, /* Minimal retries should be attempted on * this device, so use REQ_FAILFAST_DEV. * Also don't try to repair failed reads. * It is expects that no bad block log * is present. */ LastDev, /* Seems to be the last working dev as * it didn't fail, so don't use FailFast * any more for metadata */ CollisionCheck, /* * check if there is collision between raid1 * serial bios. */ Nonrot, /* non-rotational device (SSD) */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) *first_bad -= rdev->data_offset; return rv; } return 0; } static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; int bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; /** * enum mddev_flags - md device flags. * @MD_ARRAY_FIRST_USE: First use of array, needs initialization. * @MD_CLOSING: If set, we are closing the array, do not open it then. * @MD_JOURNAL_CLEAN: A raid with journal is already clean. * @MD_HAS_JOURNAL: The raid array has journal feature set. * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took * resync lock, need to release the lock. * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as * calls to md_error() will never cause the array to * become failed. * @MD_HAS_PPL: The raid array has PPL feature set. * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. * @MD_DELETED: This device is being deleted * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ enum mddev_flags { MD_ARRAY_FIRST_USE, MD_CLOSING, MD_JOURNAL_CLEAN, MD_HAS_JOURNAL, MD_CLUSTER_RESYNC_LOCKED, MD_FAILFAST_SUPPORTED, MD_HAS_PPL, MD_HAS_MULTIPLE_PPLS, MD_NOT_READY, MD_BROKEN, MD_DELETED, }; enum mddev_sb_flags { MD_SB_CHANGE_DEVS, /* Some device status has changed */ MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; #define NR_SERIAL_INFOS 8 /* record current range of serialize IOs */ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ sector_t _subtree_last; /* highest sector in subtree of rb node */ }; /* * mddev->curr_resync stores the current sector of the resync but * also has some overloaded values. */ enum { /* No resync in progress */ MD_RESYNC_NONE = 0, /* Yielded to allow another conflicting resync to commence */ MD_RESYNC_YIELDED = 1, /* Delayed to check that there is no conflict with another sync */ MD_RESYNC_DELAYED = 2, /* Any value greater than or equal to this is in an active resync */ MD_RESYNC_ACTIVE = 3, }; struct mddev { void *private; struct md_personality *pers; dev_t unit; int md_minor; struct list_head disks; unsigned long flags; unsigned long sb_flags; int suspended; struct mutex suspend_mutex; struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes * are happening, so run/ * takeover/stop are not safe */ struct gendisk *gendisk; struct kobject kobj; int hold_active; #define UNTIL_IOCTL 1 #define UNTIL_STOP 2 /* Superblock information */ int major_version, minor_version, patch_version; int persistent; int external; /* metadata is * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; time64_t ctime, utime; int level, layout; char clevel[16]; int raid_disks; int max_disks; sector_t dev_sectors; /* used size of * component devices */ sector_t array_sectors; /* exported array size */ int external_size; /* size managed * externally */ __u64 events; /* If the last 'event' was simply a clean->dirty transition, and * we didn't write it to the spares, then it is safe and simple * to just decrement the event count on a dirty->clean transition. * So we record that possibility here. */ int can_decrease_events; char uuid[16]; /* If the array is being reshaped, we need to record the * new shape and an indication of where we are up to. * This is written to the superblock. * If reshape_position is MaxSector, then no reshape is happening (yet). */ sector_t reshape_position; int delta_disks, new_level, new_layout; int new_chunk_sectors; int reshape_backwards; struct md_thread __rcu *thread; /* management thread */ struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* * Set when a sync operation is started. It holds this value even * when the sync thread is "frozen" (interrupted) or "idle" (stopped * or finished). It is overwritten when a new sync operation is begun. */ enum sync_action last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until * everything completes, then set curr_resync_completed to curr_resync. * As such it may be well behind the real resync mark, but it is a value * we are certain of. */ sector_t curr_resync_completed; unsigned long resync_mark; /* a recent timestamp */ sector_t resync_mark_cnt;/* blocks written at resync_mark */ sector_t curr_mark_cnt; /* blocks scheduled now */ sector_t resync_max_sectors; /* may be set by personality */ atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ /* allow user-space to request suspension of IO to regions of the array */ sector_t suspend_lo; sector_t suspend_hi; /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; /* resync even though the same disks are shared among md-devices */ int parallel_resync; int ok_start_degraded; unsigned long recovery; /* If a RAID personality determines that recovery (of a particular * device) will fail due to a read error on the source device, it * takes a copy of this number and does not attempt recovery again * until this number changes. */ int recovery_disabled; int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so * that we are never stopping an array while it is open. * 'reconfig_mutex' protects all other reconfiguration. * These locks are separate due to conflicting interactions * with disk->open_mutex. * Lock ordering is: * reconfig_mutex -> disk->open_mutex * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open */ struct mutex open_mutex; struct mutex reconfig_mutex; atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ int changed; /* True if we might need to * reread partition info */ int degraded; /* whether md should consider * adding a spare */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; sector_t recovery_cp; sector_t resync_min; /* user requested sync * starts here */ sector_t resync_max; /* resync should pause * when it gets here */ struct kernfs_node *sysfs_state; /* handle for 'array_state' * file in sysfs. */ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ struct kernfs_node *sysfs_level; /*handle for 'level' */ /* used for delayed sysfs removal */ struct work_struct del_work; /* used for register new sync thread */ struct work_struct sync_work; /* "lock" protects: * flush_bio transition from NULL to !NULL * rdev superblocks, events * clearing MD_CHANGE_* * in_sync - and related safemode and MD_CHANGE changes * pers (also protected by reconfig_mutex and pending IO). * clearing ->bitmap * clearing ->bitmap_info.file * changing ->resync_{min,max} * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) */ spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ unsigned int safemode; /* if set, update "clean" superblock * when no writes pending. */ unsigned int safemode_delay; struct timer_list safemode_timer; struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ void *bitmap; /* the bitmap for the device */ struct bitmap_operations *bitmap_ops; struct { struct file *file; /* the bitmap file */ loff_t offset; /* offset from superblock of * start of bitmap. May be * negative, but not '0' * For external metadata, offset * from start of device. */ unsigned long space; /* space available at this offset */ loff_t default_offset; /* this is the offset to use when * hot-adding a bitmap. It should * eventually be settable by sysfs. */ unsigned long default_space; /* space available at * default offset */ struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ unsigned long max_write_behind; /* write-behind mode */ int external; int nodes; /* Maximum number of nodes in the cluster */ char cluster_name[64]; /* Name of the cluster */ } bitmap_info; atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; const struct attribute_group *to_remove; struct bio_set bio_set; struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ struct bio_set io_clone_set; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ /* * Temporarily store rdev that will be finally removed when * reconfig_mutex is unlocked, protected by reconfig_mutex. */ struct list_head deleting; /* The sequence number for sync thread */ atomic_t sync_seq; bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; }; enum recovery_flags { /* flags for sync thread running status */ /* * set when one of sync action is set and new sync thread need to be * registered, or just add/remove spares from conf. */ MD_RECOVERY_NEEDED, /* sync thread is running, or about to be started */ MD_RECOVERY_RUNNING, /* sync thread needs to be aborted for some reason */ MD_RECOVERY_INTR, /* sync thread is done and is waiting to be unregistered */ MD_RECOVERY_DONE, /* running sync thread must abort immediately, and not restart */ MD_RECOVERY_FROZEN, /* waiting for pers->start() to finish */ MD_RECOVERY_WAIT, /* interrupted because io-error */ MD_RECOVERY_ERROR, /* flags determines sync action, see details in enum sync_action */ /* if just this flag is set, action is resync. */ MD_RECOVERY_SYNC, /* * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set, * action is repair, means user requested resync. */ MD_RECOVERY_REQUESTED, /* * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is * check. */ MD_RECOVERY_CHECK, /* recovery, or need to try it */ MD_RECOVERY_RECOVER, /* reshape */ MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, }; enum md_ro_state { MD_RDWR, MD_RDONLY, MD_AUTO_READ, MD_MAX_STATE }; static inline bool md_is_rdwr(struct mddev *mddev) { return (mddev->ro == MD_RDWR); } static inline bool reshape_interrupted(struct mddev *mddev) { /* reshape never start */ if (mddev->reshape_position == MaxSector) return false; /* interrupted */ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return true; /* running reshape will be interrupted soon. */ if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || test_bit(MD_RECOVERY_INTR, &mddev->recovery) || test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return true; return false; } static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } /* Sometimes we need to take the lock in a situation where * failure due to interrupts is not acceptable. */ static inline void mddev_lock_nointr(struct mddev *mddev) { mutex_lock(&mddev->reconfig_mutex); } static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); } extern void mddev_unlock(struct mddev *mddev); static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { if (blk_queue_io_stat(bdev->bd_disk->queue)) atomic_add(nr_sectors, &bdev->bd_disk->sync_io); } static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) { md_sync_acct(bio->bi_bdev, nr_sectors); } struct md_personality { char *name; int level; struct list_head list; struct module *owner; bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that * requires md_thread should go into start() */ int (*run)(struct mddev *mddev); /* start up works that require md threads */ int (*start)(struct mddev *mddev); void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync * if appropriate, and should abort recovery if needed */ void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, sector_t max_sector, int *skipped); int (*resize) (struct mddev *mddev, sector_t sectors); sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour */ void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. * e.g. 2drive raid1 -> 2drive raid5 * ndrive raid5 -> degraded n+1drive raid6 with special layout * If the takeover succeeds, a new 'private' structure is returned. * This needs to be installed and then ->run used to activate the * array. */ void *(*takeover) (struct mddev *mddev); /* Changes the consistency policy of an active array. */ int (*change_consistency_policy)(struct mddev *mddev, const char *buf); }; struct md_sysfs_entry { struct attribute attr; ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; extern const struct attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { if (sd) return sysfs_get_dirent(sd, name); return sd; } static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) { if (sd) sysfs_notify_dirent(sd); } static inline char * mdname (struct mddev * mddev) { return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; } static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; if (!test_bit(Replacement, &rdev->flags) && !test_bit(Journal, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else return 0; } static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; if (!test_bit(Replacement, &rdev->flags) && !test_bit(Journal, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } } /* * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. */ #define rdev_for_each_list(rdev, tmp, head) \ list_for_each_entry_safe(rdev, tmp, head, same_set) /* * iterates through the 'same array disks' ringlist */ #define rdev_for_each(rdev, mddev) \ list_for_each_entry(rdev, &((mddev)->disks), same_set) #define rdev_for_each_safe(rdev, tmp, mddev) \ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) #define rdev_for_each_rcu(rdev, mddev) \ list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; void *private; }; struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; struct bio bio_clone; }; #define THREAD_WAKEUP 0 static inline void safe_put_page(struct page *p) { if (p) put_page(p); } extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); extern int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module); extern int unregister_md_cluster_operations(void); extern int md_setup_cluster(struct mddev *mddev, int nodes); extern void md_cluster_stop(struct mddev *mddev); extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern enum sync_action md_sync_action(struct mddev *mddev); extern enum sync_action md_sync_action_by_name(const char *page); extern const char *md_sync_action_name(enum sync_action action); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(void); extern void md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); extern void mddev_destroy(struct mddev *mddev); void md_init_stacking_limits(struct queue_limits *lim); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); extern bool md_handle_request(struct mddev *mddev, struct bio *bio); extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); extern void md_idle_sync_thread(struct mddev *mddev); extern void md_frozen_sync_thread(struct mddev *mddev); extern void md_unfrozen_sync_thread(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); static inline bool is_rdev_broken(struct md_rdev *rdev) { return !disk_live(rdev->bdev->bd_disk); } static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { int faulty = test_bit(Faulty, &rdev->flags); if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } } extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; } /* clear unsupported mddev_flags */ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, unsigned long unsupported_flags) { mddev->flags &= ~unsupported_flags; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0; } static inline int mddev_suspend_and_lock(struct mddev *mddev) { int ret; ret = mddev_suspend(mddev, true); if (ret) return ret; ret = mddev_lock(mddev); if (ret) mddev_resume(mddev); return ret; } static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) { mddev_suspend(mddev, false); mutex_lock(&mddev->reconfig_mutex); } static inline void mddev_unlock_and_resume(struct mddev *mddev) { mddev_unlock(mddev); mddev_resume(mddev); } struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); #define MDDEV_STACK_INTEGRITY (1u << 0) int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, unsigned int flags); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); extern const struct block_device_operations md_fops; /* * MD devices can be used undeneath by DM, in which case ->gendisk is NULL. */ static inline bool mddev_is_dm(struct mddev *mddev) { return !mddev->gendisk; } static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { if (!mddev_is_dm(mddev)) trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); } #define mddev_add_trace_msg(mddev, fmt, args...) \ do { \ if (!mddev_is_dm(mddev)) \ blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \ } while (0) #endif /* _MD_MD_H */ |
| 3 7 7 80 5 3 72 9 9 2 7 2 9 1 9 70 1 59 5 7 11 2 9 2 9 3 3 28 66 7 6 60 6 1 206 207 128 78 206 205 187 1 202 52 1 102 51 89 2 51 142 117 53 2 53 53 53 197 118 118 5 70 44 49 77 52 28 28 77 49 9 27 27 13 14 15 15 8 7 11 15 1 1 7 7 15 11 11 11 5 4 1 15 15 15 15 42 15 15 15 15 15 15 15 8 6 6 2 2 2 2 10 10 6 10 6 4 10 10 10 10 10 4 6 6 6 5 1 5 1 6 4 38 38 38 38 37 38 38 4 34 32 34 25 13 13 12 13 38 11 27 52 3 50 15 35 35 2 9 1 1 1 1 24 34 10 41 39 2 41 25 16 16 17 22 24 24 22 2 41 8 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 44 29 15 5 5 2 3 5 5 5 5 5 5 5 111 10 7 2 7 4 72 4 6 1 62 7 26 3 15 3 21 1 6 7 7 79 79 79 65 8 13 79 6 66 13 79 66 30 49 5 49 5 85 85 17 17 17 17 1 20 11 1 1 1 4 5 5 2 2 2 2 2 2 52 50 7 7 166 125 76 173 147 31 31 14 17 5 1 1 1 15 15 15 15 15 12 107 106 106 107 37 32 107 51 59 55 38 4 59 62 61 1 59 107 47 29 46 53 48 6 10 10 10 10 10 10 10 10 10 10 10 8 10 9 10 10 42 42 42 42 42 42 6 6 1 7 8 34 12 12 12 12 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ /* * jfs_dtree.c: directory B+-tree manager * * B+-tree with variable length key directory: * * each directory page is structured as an array of 32-byte * directory entry slots initialized as a freelist * to avoid search/compaction of free space at insertion. * when an entry is inserted, a number of slots are allocated * from the freelist as required to store variable length data * of the entry; when the entry is deleted, slots of the entry * are returned to freelist. * * leaf entry stores full name as key and file serial number * (aka inode number) as data. * internal/router entry stores sufffix compressed name * as key and simple extent descriptor as data. * * each directory page maintains a sorted entry index table * which stores the start slot index of sorted entries * to allow binary search on the table. * * directory starts as a root/leaf page in on-disk inode * inline data area. * when it becomes full, it starts a leaf of a external extent * of length of 1 block. each time the first leaf becomes full, * it is extended rather than split (its size is doubled), * until its length becoms 4 KBytes, from then the extent is split * with new 4 Kbyte extent when it becomes full * to reduce external fragmentation of small directories. * * blah, blah, blah, for linear scan of directory in pieces by * readdir(). * * * case-insensitive directory file system * * names are stored in case-sensitive way in leaf entry. * but stored, searched and compared in case-insensitive (uppercase) order * (i.e., both search key and entry key are folded for search/compare): * (note that case-sensitive order is BROKEN in storage, e.g., * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad * * entries which folds to the same key makes up a equivalent class * whose members are stored as contiguous cluster (may cross page boundary) * but whose order is arbitrary and acts as duplicate, e.g., * abc, Abc, aBc, abC) * * once match is found at leaf, requires scan forward/backward * either for, in case-insensitive search, duplicate * or for, in case-sensitive search, for exact match * * router entry must be created/stored in case-insensitive way * in internal entry: * (right most key of left page and left most key of right page * are folded, and its suffix compression is propagated as router * key in parent) * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB> * should be made the router key for the split) * * case-insensitive search: * * fold search key; * * case-insensitive search of B-tree: * for internal entry, router key is already folded; * for leaf entry, fold the entry key before comparison. * * if (leaf entry case-insensitive match found) * if (next entry satisfies case-insensitive match) * return EDUPLICATE; * if (prev entry satisfies case-insensitive match) * return EDUPLICATE; * return match; * else * return no match; * * serialization: * target directory inode lock is being held on entry/exit * of all main directory service routines. * * log based recovery: */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dmap.h" #include "jfs_unicode.h" #include "jfs_debug.h" /* dtree split parameter */ struct dtsplit { struct metapage *mp; s16 index; s16 nslot; struct component_name *key; ddata_t *data; struct pxdlist *pxdlist; }; #define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) /* get page buffer for specified block address */ #define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ do { \ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ MP = NULL; \ RC = -EIO; \ } \ } \ } while (0) /* for consistency */ #define DT_PUTPAGE(MP) BT_PUTPAGE(MP) #define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) /* * forward references */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp); static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack); static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); static int dtReadFirst(struct inode *ip, struct btstack * btstack); static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack); static int dtCompare(struct component_name * key, dtpage_t * p, int si); static int ciCompare(struct component_name * key, dtpage_t * p, int si, int flag); static void dtGetKey(dtpage_t * p, int i, struct component_name * key, int flag); static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag); static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock **); static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index); static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); #define ciToUpper(c) UniStrupr((c)->name) /* * read_index_page() * * Reads a page of a directory's index table. * Having metadata mapped into the directory inode's address space * presents a multitude of problems. We avoid this by mapping to * the absolute address space outside of the *_metapage routines */ static struct metapage *read_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return read_metapage(inode, xaddr, PSIZE, 1); } /* * get_index_page() * * Same as get_index_page(), but get's a new page without reading */ static struct metapage *get_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return get_metapage(inode, xaddr, PSIZE, 1); } /* * find_index() * * Returns dtree page containing directory table entry for specified * index and pointer to its entry. * * mp must be released by caller. */ static struct dir_table_slot *find_index(struct inode *ip, u32 index, struct metapage ** mp, s64 *lblock) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); s64 blkno; s64 offset; int page_offset; struct dir_table_slot *slot; static int maxWarnings = 10; if (index < 2) { if (maxWarnings) { jfs_warn("find_entry called with index = %d", index); maxWarnings--; } return NULL; } if (index >= jfs_ip->next_index) { jfs_warn("find_entry called with index >= next_index"); return NULL; } if (jfs_dirtable_inline(ip)) { /* * Inline directory table */ *mp = NULL; slot = &jfs_ip->i_dirtable[index - 2]; } else { offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << JFS_SBI(ip->i_sb)->l2nbperpage; if (*mp && (*lblock != blkno)) { release_metapage(*mp); *mp = NULL; } if (!(*mp)) { *lblock = blkno; *mp = read_index_page(ip, blkno); } if (!(*mp)) { jfs_err("free_index: error reading directory table"); return NULL; } slot = (struct dir_table_slot *) ((char *) (*mp)->data + page_offset); } return slot; } static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, u32 index) { struct tlock *tlck; struct linelock *llck; struct lv *lv; tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) tlck->lock; if (llck->index >= llck->maxcnt) llck = txLinelock(llck); lv = &llck->lv[llck->index]; /* * Linelock slot size is twice the size of directory table * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; llck->index++; } /* * add_index() * * Adds an entry to the directory index table. This is used to provide * each directory entry with a persistent index in which to resume * directory traversals */ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) { struct super_block *sb = ip->i_sb; struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); u64 blkno; struct dir_table_slot *dirtab_slot; u32 index; struct linelock *llck; struct lv *lv; struct metapage *mp; s64 offset; uint page_offset; struct tlock *tlck; s64 xaddr; ASSERT(DO_INDEX(ip)); if (jfs_ip->next_index < 2) { jfs_warn("add_index: next_index = %d. Resetting!", jfs_ip->next_index); jfs_ip->next_index = 2; } index = jfs_ip->next_index++; if (index <= MAX_INLINE_DIRTABLE_ENTRY) { /* * i_size reflects size of index table, or 8 bytes per entry. */ ip->i_size = (loff_t) (index - 1) << 3; /* * dir table fits inline within inode */ dirtab_slot = &jfs_ip->i_dirtable[index-2]; dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); set_cflag(COMMIT_Dirtable, ip); return index; } if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { struct dir_table_slot temp_table[12]; /* * It's time to move the inline table to an external * page and begin to build the xtree */ if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { dquot_free_block(ip, sbi->nbperpage); goto clean_up; } /* * Save the table, we're going to overwrite it with the * xtree root */ memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); /* * Initialize empty x-tree */ xtInitRoot(tid, ip); /* * Add the first block to the xtree */ if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { /* This really shouldn't fail */ jfs_warn("add_index: xtInsert failed!"); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; mp = get_index_page(ip, 0); if (!mp) { jfs_err("add_index: get_metapage failed!"); xtTruncate(tid, ip, 0, COMMIT_PWMAP); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); goto clean_up; } tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) & tlck->lock; ASSERT(llck->index == 0); lv = &llck->lv[0]; lv->offset = 0; lv->length = 6; /* tlckDATA slot size is 16 bytes */ llck->index++; memcpy(mp->data, temp_table, sizeof(temp_table)); mark_metapage_dirty(mp); release_metapage(mp); /* * Logging is now directed by xtree tlocks */ clear_cflag(COMMIT_Dirtable, ip); } offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; if (page_offset == 0) { /* * This will be the beginning of a new page */ xaddr = 0; if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { jfs_warn("add_index: xtInsert failed!"); goto clean_up; } ip->i_size += PSIZE; if ((mp = get_index_page(ip, blkno))) memset(mp->data, 0, PSIZE); /* Just looks better */ else xtTruncate(tid, ip, offset, COMMIT_PWMAP); } else mp = read_index_page(ip, blkno); if (!mp) { jfs_err("add_index: get/read_metapage failed!"); goto clean_up; } lock_index(tid, ip, mp, index); dirtab_slot = (struct dir_table_slot *) ((char *) mp->data + page_offset); dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); mark_metapage_dirty(mp); release_metapage(mp); return index; clean_up: jfs_ip->next_index--; return 0; } /* * free_index() * * Marks an entry to the directory index table as free. */ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) { struct dir_table_slot *dirtab_slot; s64 lblock; struct metapage *mp = NULL; dirtab_slot = find_index(ip, index, &mp, &lblock); if (!dirtab_slot) return; dirtab_slot->flag = DIR_INDEX_FREE; dirtab_slot->slot = dirtab_slot->addr1 = 0; dirtab_slot->addr2 = cpu_to_le32(next); if (mp) { lock_index(tid, ip, mp, index); mark_metapage_dirty(mp); release_metapage(mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * modify_index() * * Changes an entry in the directory index table */ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, int slot, struct metapage ** mp, s64 *lblock) { struct dir_table_slot *dirtab_slot; dirtab_slot = find_index(ip, index, mp, lblock); if (!dirtab_slot) return; DTSaddress(dirtab_slot, bn); dirtab_slot->slot = slot; if (*mp) { lock_index(tid, ip, *mp, index); mark_metapage_dirty(*mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * read_index() * * reads a directory table slot */ static int read_index(struct inode *ip, u32 index, struct dir_table_slot * dirtab_slot) { s64 lblock; struct metapage *mp = NULL; struct dir_table_slot *slot; slot = find_index(ip, index, &mp, &lblock); if (!slot) { return -EIO; } memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); if (mp) release_metapage(mp); return 0; } /* * dtSearch() * * function: * Search for the entry with specified key * * parameter: * * return: 0 - search result on stack, leaf page pinned; * errno - I/O error */ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, struct btstack * btstack, int flag) { int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; struct metapage *mp; dtpage_t *p; s8 *stbl; int base, index, lim; struct btframe *btsp; pxd_t *pxd; int psize = 288; /* initial in-line directory */ ino_t inumber; struct component_name ciKey; struct super_block *sb = ip->i_sb; ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_NOFS); if (!ciKey.name) { rc = -ENOMEM; goto dtSearch_Exit2; } /* uppercase search key for c-i directory */ UniStrcpy(ciKey.name, key->name); ciKey.namlen = key->namlen; /* only uppercase if case-insensitive support is on */ if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { ciToUpper(&ciKey); } BT_CLR(btstack); /* reset stack */ /* init level count for max pages to split */ btstack->nsplit = 1; /* * search down tree from root: * * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of * internal page, child page Pi contains entry with k, Ki <= K < Kj. * * if entry with search key K is not found * internal page search find the entry with largest key Ki * less than K which point to the child page to search; * leaf page search find the entry with smallest key Kj * greater than K so that the returned index is the position of * the entry to be shifted right for insertion of new entry. * for empty tree, search key is greater than any key of the tree. * * by convention, root bn = 0. */ for (bn = 0;;) { /* get/pin the page to search */ DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) goto dtSearch_Exit1; /* get sorted entry table of the page */ stbl = DT_GETSTBL(p); /* * binary search with search key K on the current page. */ for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { index = base + (lim >> 1); if (stbl[index] < 0) { rc = -EIO; goto out; } if (p->header.flag & BT_LEAF) { /* uppercase leaf name to compare */ cmp = ciCompare(&ciKey, p, stbl[index], JFS_SBI(sb)->mntflag); } else { /* router key is in uppercase */ cmp = dtCompare(&ciKey, p, stbl[index]); } if (cmp == 0) { /* * search hit */ /* search hit - leaf page: * return the entry found */ if (p->header.flag & BT_LEAF) { inumber = le32_to_cpu( ((struct ldtentry *) & p->slot[stbl[index]])->inumber); /* * search for JFS_LOOKUP */ if (flag == JFS_LOOKUP) { *data = inumber; rc = 0; goto out; } /* * search for JFS_CREATE */ if (flag == JFS_CREATE) { *data = inumber; rc = -EEXIST; goto out; } /* * search for JFS_REMOVE or JFS_RENAME */ if ((flag == JFS_REMOVE || flag == JFS_RENAME) && *data != inumber) { rc = -ESTALE; goto out; } /* * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME */ /* save search result */ *data = inumber; btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* search hit - internal page: * descend/search its child page */ goto getChild; } if (cmp > 0) { base = index + 1; --lim; } } /* * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. */ /* * search miss - leaf page * * return location of entry (base) where new entry with * search key K is to be inserted. */ if (p->header.flag & BT_LEAF) { /* * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME */ if (flag == JFS_LOOKUP || flag == JFS_REMOVE || flag == JFS_RENAME) { rc = -ENOENT; goto out; } /* * search for JFS_CREATE|JFS_FINDDIR: * * save search result */ *data = 0; btsp = btstack->top; btsp->bn = bn; btsp->index = base; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* * search miss - internal page * * if base is non-zero, decrement base by one to get the parent * entry of the child page to search. */ index = base ? base - 1 : base; /* * go down to child page */ getChild: /* update max. number of pages to split */ if (BT_STACK_FULL(btstack)) { /* Something's corrupted, mark filesystem dirty so * chkdsk will fix it. */ jfs_error(sb, "stack overrun!\n"); BT_STACK_DUMP(btstack); rc = -EIO; goto out; } btstack->nsplit++; /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, index); /* get the child page block number */ pxd = (pxd_t *) & p->slot[stbl[index]]; bn = addressPXD(pxd); psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } out: DT_PUTPAGE(mp); dtSearch_Exit1: kfree(ciKey.name); dtSearch_Exit2: return rc; } /* * dtInsert() * * function: insert an entry to directory tree * * parameter: * * return: 0 - success; * errno - failure; */ int dtInsert(tid_t tid, struct inode *ip, struct component_name * name, ino_t * fsn, struct btstack * btstack) { int rc = 0; struct metapage *mp; /* meta-page buffer */ dtpage_t *p; /* base B+-tree index page */ s64 bn; int index; struct dtsplit split; /* split information */ ddata_t data; struct dt_lock *dtlck; int n; struct tlock *tlck; struct lv *lv; /* * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of * the full page. */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); if (p->header.freelist == 0) return -EINVAL; /* * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { DT_PUTPAGE(mp); return -EMLINK; } n = NDTLEAF(name->namlen); data.leaf.tid = tid; data.leaf.ip = ip; } else { n = NDTLEAF_LEGACY(name->namlen); data.leaf.ip = NULL; /* signifies legacy directory format */ } data.leaf.ino = *fsn; /* * leaf page does not have enough room for new entry: * * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ if (n > p->header.freecnt) { split.mp = mp; split.index = index; split.nslot = n; split.key = name; split.data = &data; rc = dtSplitUp(tid, ip, &split, btstack); return rc; } /* * leaf page does have enough room for new entry: * * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; dtInsertEntry(p, index, name, &data, &dtlck); /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; n = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + n; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; } /* * dtSplitUp() * * function: propagate insertion bottom up; * * parameter: * * return: 0 - success; * errno - failure; * leaf page unpinned; */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); int rc = 0; struct metapage *smp; dtpage_t *sp; /* split page */ struct metapage *rmp; dtpage_t *rp; /* new right page split from sp */ pxd_t rpxd; /* new right page extent descriptor */ struct metapage *lmp; dtpage_t *lp; /* left child page */ int skip; /* index of entry of insertion */ struct btframe *parent; /* parent page entry on traverse stack */ s64 xaddr, nxaddr; int xlen, xsize; struct pxdlist pxdlist; pxd_t *pxd; struct component_name key = { 0, NULL }; ddata_t *data = split->data; int n; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int quota_allocation = 0; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS); if (!key.name) { DT_PUTPAGE(smp); rc = -ENOMEM; goto dtSplitUp_Exit; } /* * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. */ /* * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* * allocate a single extent child page */ xlen = 1; n = sbi->bsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ if (n <= split->nslot) xlen++; if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { DT_PUTPAGE(smp); goto freeKeyName; } pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); split->pxdlist = &pxdlist; rc = dtSplitRoot(tid, ip, split, &rmp); if (rc) dbFree(ip, xaddr, xlen); else DT_PUTPAGE(rmp); DT_PUTPAGE(smp); if (!DO_INDEX(ip)) ip->i_size = xlen << sbi->l2bsize; goto freeKeyName; } /* * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() reurns leaf page unpinned) */ pxd = &sp->header.self; xlen = lengthPXD(pxd); xsize = xlen << sbi->l2bsize; if (xsize < PSIZE) { xaddr = addressPXD(pxd); n = xsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ if ((n + sp->header.freecnt) <= split->nslot) n = xlen + (xlen << 1); else n = xlen; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, n); if (rc) goto extendOut; quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, (s64) n, &nxaddr))) goto extendOut; pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, nxaddr); PXDlength(pxd, xlen + n); split->pxdlist = &pxdlist; if ((rc = dtExtendPage(tid, ip, split, btstack))) { nxaddr = addressPXD(pxd); if (xaddr != nxaddr) { /* free relocated extent */ xlen = lengthPXD(pxd); dbFree(ip, nxaddr, (s64) xlen); } else { /* free extended delta */ xlen = lengthPXD(pxd) - n; xaddr = addressPXD(pxd) + xlen; dbFree(ip, xaddr, (s64) n); } } else if (!DO_INDEX(ip)) ip->i_size = lengthPXD(pxd) << sbi->l2bsize; extendOut: DT_PUTPAGE(smp); goto freeKeyName; } /* * split leaf page <sp> into <sp> and a new right page <rp>. * * return <rp> pinned and its extent descriptor <rpxd> */ /* * allocate new directory page extent and * new index page(s) to cover page split(s) * * allocation hint: ? */ n = btstack->nsplit; pxdlist.maxnpxd = pxdlist.npxd = 0; xlen = sbi->nbperpage; for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); pxdlist.maxnpxd++; continue; } DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } split->pxdlist = &pxdlist; if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } if (!DO_INDEX(ip)) ip->i_size += PSIZE; /* * propagate up the router entry for the leaf page just split * * insert a router entry for the new page into the parent page, * propagate the insert/split up the tree by walking back the stack * of (bn of parent page, index of child page entry in parent page) * that were traversed during the search for the page that split. * * the propagation of insert/split up the tree stops if the root * splits or the page inserted into doesn't have to split to hold * the new entry. * * the parent entry for the split page remains the same, and * a new entry is inserted at its right with the first key and * block number of the new right page. * * There are a maximum of 4 pages pinned at any time: * two children, left parent and right parent (when the parent splits). * keep the child pages pinned while working on the parent. * make sure that all pins are released at exit. */ while ((parent = BT_POP(btstack)) != NULL) { /* parent page specified by stack frame <parent> */ /* keep current child pages (<lp>, <rp>) pinned */ lmp = smp; lp = sp; /* * insert router entry in parent for new right child page <rp> */ /* get the parent page <sp> */ DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); goto splitOut; } /* * The new key entry goes ONE AFTER the index of parent entry, * because the split was to the right. */ skip = parent->index + 1; /* * compute the key for the router entry * * key suffix compression: * for internal pages that have leaf pages as children, * retain only what's needed to distinguish between * the new entry and the entry on the page to its left. * If the keys compare equal, retain the entire key. * * note that compression is performed only at computing * router key at the lowest internal level. * further compression of the key between pairs of higher * level internal pages loses too much information and * the search may fail. * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} * results in two adjacent parent entries (a)(xx). * if split occurs between these two entries, and * if compression is applied, the router key of parent entry * of right page (x) will divert search for x into right * subtree and miss x in the left subtree.) * * the entire key must be retained for the next-to-leftmost * internal key at any level of the tree, or search may fail * (e.g., ?) */ switch (rp->header.flag & BT_TYPE) { case BT_LEAF: /* * compute the length of prefix for suffix compression * between last entry of left page and first entry * of right page */ if ((sp->header.flag & BT_ROOT && skip > 1) || sp->header.prev != 0 || skip > 1) { /* compute uppercase router prefix key */ rc = ciGetLeafPrefixKey(lp, lp->header.nextindex-1, rp, 0, &key, sbi->mntflag); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); DT_PUTPAGE(smp); goto splitOut; } } else { /* next to leftmost entry of lowest internal level */ /* compute uppercase router key */ dtGetKey(rp, 0, &key, sbi->mntflag); key.name[key.namlen] = 0; if ((sbi->mntflag & JFS_OS2) == JFS_OS2) ciToUpper(&key); } n = NDTINTERNAL(key.namlen); break; case BT_INTERNAL: dtGetKey(rp, 0, &key, sbi->mntflag); n = NDTINTERNAL(key.namlen); break; default: jfs_err("dtSplitUp(): UFO!"); break; } /* unpin left child page */ DT_PUTPAGE(lmp); /* * compute the data for the router entry */ data->xd = rpxd; /* child page xd */ /* * parent page is full - split the parent page */ if (n > sp->header.freecnt) { /* init for parent page split */ split->mp = smp; split->index = skip; /* index at insert */ split->nslot = n; split->key = &key; /* split->data = data; */ /* unpin right child page */ DT_PUTPAGE(rmp); /* The split routines insert the new entry, * acquire txLock as appropriate. * return <rp> pinned and its block number <rbn>. */ rc = (sp->header.flag & BT_ROOT) ? dtSplitRoot(tid, ip, split, &rmp) : dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); if (rc) { DT_PUTPAGE(smp); goto splitOut; } /* smp and rmp are pinned */ } /* * parent page is not full - insert router entry in parent page */ else { BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the parent page */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root parent page */ if (!(sp->header.flag & BT_ROOT)) { lv++; n = skip >> L2DTSLOTSIZE; lv->offset = sp->header.stblindex + n; lv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } dtInsertEntry(sp, skip, &key, data, &dtlck); /* exit propagate up */ break; } } /* unpin current split and its right page */ DT_PUTPAGE(smp); DT_PUTPAGE(rmp); /* * free remaining extents allocated for split */ splitOut: n = pxdlist.npxd; pxd = &pxdlist.pxd[n]; for (; n < pxdlist.maxnpxd; n++, pxd++) dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); freeKeyName: kfree(key.name); /* Rollback quota allocation */ if (rc && quota_allocation) dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: return rc; } /* * dtSplitPage() * * function: Split a non-root page of a btree. * * parameter: * * return: 0 - success; * errno - failure; * return split and new page pinned; */ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) { int rc = 0; struct metapage *smp; dtpage_t *sp; struct metapage *rmp; dtpage_t *rp; /* new right page allocated */ s64 rbn; /* new right page block number */ struct metapage *mp; dtpage_t *p; s64 nextbn; struct pxdlist *pxdlist; pxd_t *pxd; int skip, nextindex, half, left, nxt, off, si; struct ldtentry *ldtentry; struct idtentry *idtentry; u8 *stbl; struct dtslot *f; int fsi, stblsize; int n; struct dt_lock *sdtlck, *rdtlck; struct tlock *tlck; struct dt_lock *dtlck; struct lv *slv, *rlv, *lv; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); /* * allocate the new right page for the split */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); rmp = get_metapage(ip, rbn, PSIZE, 1); if (rmp == NULL) return -EIO; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); rdtlck = (struct dt_lock *) & tlck->lock; rp = (dtpage_t *) rmp->data; *rpp = rp; rp->header.self = *pxd; BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the split page * * action: */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); sdtlck = (struct dt_lock *) & tlck->lock; /* linelock header of split page */ ASSERT(sdtlck->index == 0); slv = & sdtlck->lv[0]; slv->offset = 0; slv->length = 1; sdtlck->index++; /* * initialize/update sibling pointers between sp and rp */ nextbn = le64_to_cpu(sp->header.next); rp->header.next = cpu_to_le64(nextbn); rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); sp->header.next = cpu_to_le64(rbn); /* * initialize new right page */ rp->header.flag = sp->header.flag; /* compute sorted entry table at start of extent data area */ rp->header.nextindex = 0; rp->header.stblindex = 1; n = PSIZE >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ /* init freelist */ fsi = rp->header.stblindex + stblsize; rp->header.freelist = fsi; rp->header.freecnt = rp->header.maxslot - fsi; /* * sequential append at tail: append without split * * If splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is * sequential. Adding an empty page on the side of the level is less * work and can push the fill factor much higher than normal. * If we're wrong it's no big deal, we'll just do the split the right * way next time. * (It may look like it's equally easy to do a similar hack for * reverse sorted data, that is, split the tree left, * but it's not. Be my guest.) */ if (nextbn == 0 && split->index == sp->header.nextindex) { /* linelock header + stbl (first slot) of new page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 2; rdtlck->index++; /* * initialize freelist of new right page */ f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* insert entry at the first entry of the new right page */ dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); goto out; } /* * non-sequential insert (at possibly middle page) */ /* * update prev pointer of previous right sibling page; */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) { discard_metapage(rmp); return rc; } BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header of previous right sibling page */ lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(rbn); DT_PUTPAGE(mp); } /* * split the data between the split and right pages. */ skip = split->index; half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ left = 0; /* * compute fill factor for split pages * * <nxt> traces the next entry to move to rp * <off> traces the next entry to stay in sp */ stbl = (u8 *) & sp->slot[sp->header.stblindex]; nextindex = sp->header.nextindex; for (nxt = off = 0; nxt < nextindex; ++off) { if (off == skip) /* check for fill factor with new entry size */ n = split->nslot; else { si = stbl[nxt]; switch (sp->header.flag & BT_TYPE) { case BT_LEAF: ldtentry = (struct ldtentry *) & sp->slot[si]; if (DO_INDEX(ip)) n = NDTLEAF(ldtentry->namlen); else n = NDTLEAF_LEGACY(ldtentry-> namlen); break; case BT_INTERNAL: idtentry = (struct idtentry *) & sp->slot[si]; n = NDTINTERNAL(idtentry->namlen); break; default: break; } ++nxt; /* advance to next entry to move in sp */ } left += n; if (left >= half) break; } /* <nxt> poins to the 1st entry to move */ /* * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * * split page moved out entries are linelocked; * new/right page moved in entries are linelocked; */ /* linelock header + stbl of new right page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 5; rdtlck->index++; dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); sp->header.nextindex = nxt; /* * finalize freelist of new right page */ fsi = rp->header.freelist; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * the skipped index was on the left page, */ if (skip <= off) { /* insert the new entry in the split page */ dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); /* linelock stbl of split page */ if (sdtlck->index >= sdtlck->maxcnt) sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[sdtlck->index]; n = skip >> L2DTSLOTSIZE; slv->offset = sp->header.stblindex + n; slv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; sdtlck->index++; } /* * the skipped index was on the right page, */ else { /* adjust the skip index to reflect the new position */ skip -= nxt; /* insert the new entry in the right page */ dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); } out: *rmpp = rmp; *rpxdp = *pxd; return rc; } /* * dtExtendPage() * * function: extend 1st/only directory leaf page * * parameter: * * return: 0 - success; * errno - failure; * return extended page pinned; */ static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct super_block *sb = ip->i_sb; int rc; struct metapage *smp, *pmp, *mp; dtpage_t *sp, *pp; struct pxdlist *pxdlist; pxd_t *pxd, *tpxd; int xlen, xsize; int newstblindex, newstblsize; int oldstblindex, oldstblsize; int fsi, last; struct dtslot *f; struct btframe *parent; int n; struct dt_lock *dtlck; s64 xaddr, txaddr; struct tlock *tlck; struct pxd_lock *pxdlock; struct lv *lv; uint type; struct ldtentry *ldtentry; u8 *stbl; /* get page to extend */ smp = split->mp; sp = DT_PAGE(ip, smp); /* get parent/root page */ parent = BT_POP(btstack); DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); if (rc) return (rc); /* * extend the extent */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; xaddr = addressPXD(pxd); tpxd = &sp->header.self; txaddr = addressPXD(tpxd); /* in-place extension */ if (xaddr == txaddr) { type = tlckEXTEND; } /* relocation */ else { type = tlckNEW; /* save moved extent descriptor for later free */ tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = sp->header.self; pxdlock->index = 1; /* * Update directory index table to reflect new page address */ if (DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(sp); for (n = 0; n < sp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & sp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), xaddr, n, &mp, &lblock); } if (mp) release_metapage(mp); } } /* * extend the page */ sp->header.self = *pxd; jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the extended/leaf page */ tlck = txLock(tid, ip, smp, tlckDTREE | type); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[0]; /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; /* * copy old stbl to new stbl at start of extended area */ oldstblindex = sp->header.stblindex; oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; newstblindex = sp->header.maxslot; n = xsize >> L2DTSLOTSIZE; newstblsize = (n + 31) >> L2DTSLOTSIZE; memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], sp->header.nextindex); /* * in-line extension: linelock old area of extended page */ if (type == tlckEXTEND) { /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; lv++; /* linelock new stbl of extended page */ lv->offset = newstblindex; lv->length = newstblsize; } /* * relocation: linelock whole relocated area */ else { lv->offset = 0; lv->length = sp->header.maxslot + newstblsize; } dtlck->index++; sp->header.maxslot = n; sp->header.stblindex = newstblindex; /* sp->header.nextindex remains the same */ /* * add old stbl region at head of freelist */ fsi = oldstblindex; f = &sp->slot[fsi]; last = sp->header.freelist; for (n = 0; n < oldstblsize; n++, fsi++, f++) { f->next = last; last = fsi; } sp->header.freelist = last; sp->header.freecnt += oldstblsize; /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = newstblindex + newstblsize; f = &sp->slot[fsi]; for (fsi++; fsi < sp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) sp->header.freelist = n; else { do { f = &sp->slot[fsi]; fsi = f->next; } while (fsi != -1); f->next = n; } sp->header.freecnt += sp->header.maxslot - n; /* * insert the new entry */ dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); BT_MARK_DIRTY(pmp, ip); /* * linelock any freeslots residing in old extent */ if (type == tlckEXTEND) { n = sp->header.maxslot >> 2; if (sp->header.freelist < n) dtLinelockFreelist(sp, n, &dtlck); } /* * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page */ tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[dtlck->index]; /* linelock parent entry - 1st slot */ lv->offset = 1; lv->length = 1; dtlck->index++; /* update the parent pxd for page extension */ tpxd = (pxd_t *) & pp->slot[1]; *tpxd = *pxd; DT_PUTPAGE(pmp); return 0; } /* * dtSplitRoot() * * function: * split the full root page into * original/root/split page and new right page * i.e., root remains fixed in tree anchor (inode) and * the root is copied to a single new right child page * since root page << non-root page, and * the split root page contains a single entry for the * new right child page. * * parameter: * * return: 0 - success; * errno - failure; * return new page pinned; */ static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) { struct super_block *sb = ip->i_sb; struct metapage *smp; dtroot_t *sp; struct metapage *rmp; dtpage_t *rp; s64 rbn; int xlen; int xsize; struct dtslot *f; s8 *stbl; int fsi, stblsize, n; struct idtentry *s; pxd_t *ppxd; struct pxdlist *pxdlist; pxd_t *pxd; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int rc; /* get split root page */ smp = split->mp; sp = &JFS_IP(ip)->i_dtroot; /* * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; rmp = get_metapage(ip, rbn, xsize, 1); if (!rmp) return -EIO; rp = rmp->data; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); dtlck = (struct dt_lock *) & tlck->lock; rp->header.flag = (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; rp->header.self = *pxd; /* initialize sibling pointers */ rp->header.next = 0; rp->header.prev = 0; /* * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = 10; /* 1 + 8 + 1 */ dtlck->index++; n = xsize >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* copy old stbl to new stbl at start of extended area */ rp->header.stblindex = DTROOTMAXSLOT; stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; memcpy(stbl, sp->header.stbl, sp->header.nextindex); rp->header.nextindex = sp->header.nextindex; /* copy old data area to start of new data area */ memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = DTROOTMAXSLOT + stblsize; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) rp->header.freelist = n; else { rp->header.freelist = fsi; do { f = &rp->slot[fsi]; fsi = f->next; } while (fsi >= 0); f->next = n; } rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; struct metapage *mp = NULL; struct ldtentry *ldtentry; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * insert the new entry into the new right/child page * (skip index in the new right page will not change) */ dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* * reset parent/root page * * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. * * The btree comparison code guarantees that the left-most key on any * level of the tree is never used, so it doesn't need to be filled in. */ BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the root page (in-memory inode) */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; /* update page header of root */ if (sp->header.flag & BT_LEAF) { sp->header.flag &= ~BT_LEAF; sp->header.flag |= BT_INTERNAL; } /* init the first entry */ s = (struct idtentry *) & sp->slot[DTENTRYSTART]; ppxd = (pxd_t *) s; *ppxd = *pxd; s->next = -1; s->namlen = 0; stbl = sp->header.stbl; stbl[0] = DTENTRYSTART; sp->header.nextindex = 1; /* init freelist */ fsi = DTENTRYSTART + 1; f = &sp->slot[fsi]; /* init free region of remaining area */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; sp->header.freelist = DTENTRYSTART + 1; sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); *rmpp = rmp; return 0; } /* * dtDelete() * * function: delete the entry(s) referenced by a key. * * parameter: * * return: */ int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, ino_t * ino, int flag) { int rc = 0; s64 bn; struct metapage *mp, *imp; dtpage_t *p; int index; struct btstack btstack; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int i; struct ldtentry *ldtentry; u8 *stbl; u32 table_index, next_index; struct metapage *nmp; dtpage_t *np; /* * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). */ if ((rc = dtSearch(ip, key, ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* * We need to find put the index of the next entry into the * directory index table in order to resume a readdir from this * entry. */ if (DO_INDEX(ip)) { stbl = DT_GETSTBL(p); ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; table_index = le32_to_cpu(ldtentry->index); if (index == (p->header.nextindex - 1)) { /* * Last entry in this leaf page */ if ((p->header.flag & BT_ROOT) || (p->header.next == 0)) next_index = -1; else { /* Read next leaf page */ DT_GETPAGE(ip, le64_to_cpu(p->header.next), nmp, PSIZE, np, rc); if (rc) next_index = -1; else { stbl = DT_GETSTBL(np); ldtentry = (struct ldtentry *) & np-> slot[stbl[0]]; next_index = le32_to_cpu(ldtentry->index); DT_PUTPAGE(nmp); } } } else { ldtentry = (struct ldtentry *) & p->slot[stbl[index + 1]]; next_index = le32_to_cpu(ldtentry->index); } free_index(tid, ip, table_index, next_index); } /* * the leaf page becomes empty, delete the page */ if (p->header.nextindex == 1) { /* delete empty page */ rc = dtDeleteUp(tid, ip, mp, p, &btstack); } /* * the leaf page has other entries remaining: * * delete the entry from the leaf page. */ else { BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* * Do not assume that dtlck->index will be zero. During a * rename within a directory, this transaction may have * modified this page already when adding the new entry. */ /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the leaf entry */ dtDeleteEntry(p, index, &dtlck); /* * Update directory index table for entries moved in stbl */ if (DO_INDEX(ip) && index < p->header.nextindex) { s64 lblock; imp = NULL; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { ldtentry = (struct ldtentry *) & p->slot[stbl[i]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), bn, i, &imp, &lblock); } if (imp) release_metapage(imp); } DT_PUTPAGE(mp); } return rc; } /* * dtDeleteUp() * * function: * free empty pages as propagating deletion up the tree * * parameter: * * return: */ static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) { int rc = 0; struct metapage *mp; dtpage_t *p; int index, nextindex; int xlen; struct btframe *parent; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; struct pxd_lock *pxdlock; int i; /* * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(fmp); return 0; } /* * free the non-root leaf page */ /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record * N.B. linelock is overlaid as freed extent descriptor, and * the buffer page is freed; */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = fp->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, fp))) { BT_PUTPAGE(fmp); return rc; } xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); /* * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. * stop if the root page is reached (which is never deleted) or * if the entry deletion does not empty the page. */ while ((parent = BT_POP(btstack)) != NULL) { /* pin the parent page <sp> */ DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); if (rc) return rc; /* * free the extent of the child page deleted */ index = parent->index; /* * delete the entry for the child page from parent */ nextindex = p->header.nextindex; /* * the parent has the single entry being deleted: * * free the parent page which has become empty. */ if (nextindex == 1) { /* * keep the root internal page which has become empty */ if (p->header.flag & BT_ROOT) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(mp); return 0; } /* * free the parent page */ else { /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = p->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, p))) { DT_PUTPAGE(mp); return rc; } xlen = lengthPXD(&p->header.self); /* Free quota allocation */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); /* propagate up */ continue; } } /* * the parent has other entries remaining: * * delete the router entry from the parent page. */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the page * * action: router entry deletion */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the router entry */ dtDeleteEntry(p, index, &dtlck); /* reset key of new leftmost entry of level (for consistency) */ if (index == 0 && ((p->header.flag & BT_ROOT) || p->header.prev == 0)) dtTruncateEntry(p, 0, &dtlck); /* unpin the parent page */ DT_PUTPAGE(mp); /* exit propagation up */ break; } if (!DO_INDEX(ip)) ip->i_size -= PSIZE; return 0; } /* * dtRelink() * * function: * link around a freed page. * * parameter: * fp: page to be freed * * return: */ static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) { int rc; struct metapage *mp; s64 nextbn, prevbn; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; nextbn = le64_to_cpu(p->header.next); prevbn = le64_to_cpu(p->header.prev); /* update prev pointer of the next page */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page * * action: update prev pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(prevbn); DT_PUTPAGE(mp); } /* update next pointer of the previous page */ if (prevbn != 0) { DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the prev page * * action: update next pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.next = cpu_to_le64(nextbn); DT_PUTPAGE(mp); } return 0; } /* * dtInitRoot() * * initialize directory root (inline in inode) */ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); dtroot_t *p; int fsi; struct dtslot *f; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; u16 xflag_save; /* * If this was previously an non-empty directory, we need to remove * the old directory table. */ if (DO_INDEX(ip)) { if (!jfs_dirtable_inline(ip)) { struct tblock *tblk = tid_to_tblock(tid); /* * We're playing games with the tid's xflag. If * we're removing a regular file, the file's xtree * is committed with COMMIT_PMAP, but we always * commit the directories xtree with COMMIT_PWMAP. */ xflag_save = tblk->xflag; tblk->xflag = 0; /* * xtTruncate isn't guaranteed to fully truncate * the xtree. The caller needs to check i_size * after committing the transaction to see if * additional truncation is needed. The * COMMIT_Stale flag tells caller that we * initiated the truncation. */ xtTruncate(tid, ip, 0, COMMIT_PWMAP); set_cflag(COMMIT_Stale, ip); tblk->xflag = xflag_save; } else ip->i_size = 1; jfs_ip->next_index = 2; } else ip->i_size = IDATASIZE; /* * acquire a transaction lock on the root * * action: directory initialization; */ tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, tlckDTREE | tlckENTRY | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; p = &jfs_ip->i_dtroot; p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; p->header.nextindex = 0; /* init freelist */ fsi = 1; f = &p->slot[fsi]; /* init data area of root */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; p->header.freelist = 1; p->header.freecnt = 8; /* init '..' entry */ p->header.idotdot = cpu_to_le32(idotdot); return; } /* * add_missing_indices() * * function: Fix dtree page in which one or more entries has an invalid index. * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. */ static void add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; int i; uint index; struct lv *lv; struct metapage *mp; dtpage_t *p; int rc; s8 *stbl; tid_t tid; struct tlock *tlck; tid = txBegin(inode->i_sb, 0); DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); if (rc) { printk(KERN_ERR "DT_GETPAGE failed!\n"); goto end; } BT_MARK_DIRTY(mp, inode); ASSERT(p->header.flag & BT_LEAF); tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); if (BT_IS_ROOT(mp)) tlck->type |= tlckBTROOT; dtlck = (struct dt_lock *) &tlck->lock; stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { d->index = cpu_to_le32(add_index(tid, inode, bn, i)); if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = &dtlck->lv[dtlck->index]; lv->offset = stbl[i]; lv->length = 1; dtlck->index++; } } DT_PUTPAGE(mp); (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); } /* * Buffer to hold directory entry info while traversing a dtree page * before being fed to the filldir function */ struct jfs_dirent { loff_t position; int ino; u16 name_len; char name[]; }; /* * function to determine next variable-sized jfs_dirent in buffer */ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) { return (struct jfs_dirent *) ((char *)dirent + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + sizeof (loff_t) - 1) & ~(sizeof (loff_t) - 1))); } /* * jfs_readdir() * * function: read directory entries sequentially * from the specified entry offset * * parameter: * * return: offset = (pn, index) of start entry * of next jfs_readdir()/dtRead() */ int jfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *ip = file_inode(file); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) &dtpos; s64 bn; struct metapage *mp; dtpage_t *p; int index; s8 *stbl; struct btstack btstack; int i, next; struct ldtentry *d; struct dtslot *t; int d_namleft, len, outlen; unsigned long dirent_buf; char *name_ptr; u32 dir_index; int do_index = 0; uint loop_count = 0; struct jfs_dirent *jfs_dirent; int jfs_dirents; int overflow, fix_page, page_fixed = 0; static int unique_pos = 2; /* If we can't fix broken index */ if (ctx->pos == DIREND) return 0; if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. * Special cases: 0 = . * 1 = .. * -1 = End of directory */ do_index = 1; dir_index = (u32) ctx->pos; /* * NFSv4 reserves cookies 1 and 2 for . and .. so the value * we return to the vfs is one greater than the one we use * internally. */ if (dir_index) dir_index--; if (dir_index > 1) { struct dir_table_slot dirtab_slot; if (dtEmpty(ip) || (dir_index >= JFS_IP(ip)->next_index)) { /* Stale position. Directory has shrunk */ ctx->pos = DIREND; return 0; } repeat: rc = read_index(ip, dir_index, &dirtab_slot); if (rc) { ctx->pos = DIREND; return rc; } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { jfs_err("jfs_readdir detected infinite loop!"); ctx->pos = DIREND; return 0; } dir_index = le32_to_cpu(dirtab_slot.addr2); if (dir_index == -1) { ctx->pos = DIREND; return 0; } goto repeat; } bn = addressDTS(&dirtab_slot); index = dirtab_slot.slot; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { ctx->pos = DIREND; return 0; } if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); ctx->pos = DIREND; return 0; } } else { if (dir_index == 0) { /* * self "." */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; /* * Find first entry of left-most leaf */ if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadFirst(ip, &btstack))) return rc; DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); } } else { /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * * pn = 0; index = 1: First entry "." * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; if (dtpos < 2) { /* build "." entry */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { jfs_err("jfs_readdir called with invalid offset!"); } dtoffset->pn = 1; dtoffset->index = 0; ctx->pos = dtpos; } if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext", rc); ctx->pos = DIREND; return 0; } /* get start leaf page and index */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* offset beyond directory eof ? */ if (bn < 0) { ctx->pos = DIREND; return 0; } } dirent_buf = __get_free_page(GFP_KERNEL); if (dirent_buf == 0) { DT_PUTPAGE(mp); jfs_warn("jfs_readdir: __get_free_page failed!"); ctx->pos = DIREND; return -ENOMEM; } while (1) { jfs_dirent = (struct jfs_dirent *) dirent_buf; jfs_dirents = 0; overflow = fix_page = 0; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { d = (struct ldtentry *) & p->slot[stbl[i]]; if (((long) jfs_dirent + d->namlen + 1) > (dirent_buf + PAGE_SIZE)) { /* DBCS codepages could overrun dirent_buf */ index = i; overflow = 1; break; } d_namleft = d->namlen; name_ptr = jfs_dirent->name; jfs_dirent->ino = le32_to_cpu(d->inumber); if (do_index) { len = min(d_namleft, DTLHDRDATALEN); jfs_dirent->position = le32_to_cpu(d->index); /* * d->index should always be valid, but it * isn't. fsck.jfs doesn't create the * directory index for the lost+found * directory. Rather than let it go, * we can try to fix it. */ if ((jfs_dirent->position < 2) || (jfs_dirent->position >= JFS_IP(ip)->next_index)) { if (!page_fixed && !isReadOnly(ip)) { fix_page = 1; /* * setting overflow and setting * index to i will cause the * same page to be processed * again starting here */ overflow = 1; index = i; break; } jfs_dirent->position = unique_pos++; } /* * We add 1 to the index because we may * use a value of 2 internally, and NFSv4 * doesn't like that. */ jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); } /* copy the name of head/only segment */ outlen = jfs_strfromUCS_le(name_ptr, d->name, len, codepage); jfs_dirent->name_len = outlen; /* copy name in the additional segment(s) */ next = d->next; while (next >= 0) { t = (struct dtslot *) & p->slot[next]; name_ptr += outlen; d_namleft -= len; /* Sanity Check */ if (d_namleft == 0) { jfs_error(ip->i_sb, "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", (long)ip->i_ino, (long long)bn, i); goto skip_one; } len = min(d_namleft, DTSLOTDATALEN); outlen = jfs_strfromUCS_le(name_ptr, t->name, len, codepage); jfs_dirent->name_len += outlen; next = t->next; } jfs_dirents++; jfs_dirent = next_jfs_dirent(jfs_dirent); skip_one: if (!do_index) dtoffset->index++; } if (!overflow) { /* Point to next leaf page */ if (p->header.flag & BT_ROOT) bn = 0; else { bn = le64_to_cpu(p->header.next); index = 0; /* update offset (pn:index) for new page */ if (!do_index) { dtoffset->pn++; dtoffset->index = 0; } } page_fixed = 0; } /* unpin previous leaf page */ DT_PUTPAGE(mp); jfs_dirent = (struct jfs_dirent *) dirent_buf; while (jfs_dirents--) { ctx->pos = jfs_dirent->position; if (!dir_emit(ctx, jfs_dirent->name, jfs_dirent->name_len, jfs_dirent->ino, DT_UNKNOWN)) goto out; jfs_dirent = next_jfs_dirent(jfs_dirent); } if (fix_page) { add_missing_indices(ip, bn); page_fixed = 1; } if (!overflow && (bn == 0)) { ctx->pos = DIREND; break; } DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { free_page(dirent_buf); return rc; } } out: free_page(dirent_buf); return rc; } /* * dtReadFirst() * * function: get the leftmost page of the directory */ static int dtReadFirst(struct inode *ip, struct btstack * btstack) { int rc = 0; s64 bn; int psize = 288; /* initial in-line directory */ struct metapage *mp; dtpage_t *p; s8 *stbl; struct btframe *btsp; pxd_t *xd; BT_CLR(btstack); /* reset stack */ /* * descend leftmost path of the tree * * by convention, root bn = 0. */ for (bn = 0;;) { DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) return rc; /* * leftmost leaf page */ if (p->header.flag & BT_LEAF) { /* return leftmost entry */ btsp = btstack->top; btsp->bn = bn; btsp->index = 0; btsp->mp = mp; return 0; } /* * descend down to leftmost child page */ if (BT_STACK_FULL(btstack)) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "btstack overrun\n"); BT_STACK_DUMP(btstack); return -EIO; } /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, 0); /* get the leftmost entry */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[0]]; /* get the child page block address */ bn = addressPXD(xd); psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } } /* * dtReadNext() * * function: get the page of the specified offset (pn:index) * * return: if (offset > eof), bn = -1; * * note: if index > nextindex of the target leaf page, * start with 1st entry of next leaf page; */ static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack) { int rc = 0; struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) offset; s64 bn; struct metapage *mp; dtpage_t *p; int index; int pn; s8 *stbl; struct btframe *btsp, *parent; pxd_t *xd; /* * get leftmost leaf page pinned */ if ((rc = dtReadFirst(ip, btstack))) return rc; /* get leaf page */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* get the start offset (pn:index) */ pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ index = dtoffset->index; /* start at leftmost page ? */ if (pn == 0) { /* offset beyond eof ? */ if (index < p->header.nextindex) goto out; if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = index = 0; goto a; } /* start at non-leftmost page: scan parent pages for large pn */ if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start after next leaf page ? */ if (pn > 1) goto b; /* get leaf page pn = 1 */ a: bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } goto c; /* * scan last internal page level to get target leaf page */ b: /* unpin leftmost leaf page */ DT_PUTPAGE(mp); /* get left most parent page */ btsp = btstack->top; parent = btsp - 1; bn = parent->bn; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* scan parent pages at last internal page level */ while (pn >= p->header.nextindex) { pn -= p->header.nextindex; /* get next parent page address */ bn = le64_to_cpu(p->header.next); /* unpin current parent page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next parent page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* update parent page stack frame */ parent->bn = bn; } /* get leaf page address */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[pn]]; bn = addressPXD(xd); /* unpin parent page */ DT_PUTPAGE(mp); /* * get target leaf page */ c: DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * leaf page has been completed: * start with 1st entry of next leaf page */ if (index >= p->header.nextindex) { bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next leaf page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = 0; } out: /* return target leaf page pinned */ btsp = btstack->top; btsp->bn = bn; btsp->index = dtoffset->index; btsp->mp = mp; return 0; } /* * dtCompare() * * function: compare search key with an internal entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int dtCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si) { /* entry slot index */ wchar_t *kname; __le16 *name; int klen, namlen, len, rc; struct idtentry *ih; struct dtslot *t; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); /* compare with head/only segment */ len = min(klen, len); if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; /* compare with additional segment(s) */ kname += len; while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; kname += len; si = t->next; } return (klen - namlen); } /* * ciCompare() * * function: compare search key with an (leaf/internal) entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int ciCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si, /* entry slot index */ int flag) { wchar_t *kname, x; __le16 *name; int klen, namlen, len, rc; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int i; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; /* * leaf page entry */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; name = lh->name; namlen = lh->namlen; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } /* * internal page entry */ else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); } /* compare with head/only segment */ len = min(klen, len); for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; /* compare with additional segment(s) */ while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; si = t->next; } return (klen - namlen); } /* * ciGetLeafPrefixKey() * * function: compute prefix of suffix compression * from two adjacent leaf entries * across page boundary * * return: non-zero on error * */ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag) { int klen, namlen; wchar_t *pl, *pr, *kname; struct component_name lkey; struct component_name rkey; lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) return -ENOMEM; rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); return -ENOMEM; } /* get left and right key */ dtGetKey(lp, li, &lkey, flag); lkey.name[lkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&lkey); dtGetKey(rp, ri, &rkey, flag); rkey.name[rkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&rkey); /* compute prefix */ klen = 0; kname = key->name; namlen = min(lkey.namlen, rkey.namlen); for (pl = lkey.name, pr = rkey.name; namlen; pl++, pr++, namlen--, klen++, kname++) { *kname = *pr; if (*pl != *pr) { key->namlen = klen + 1; goto free_names; } } /* l->namlen <= r->namlen since l <= r */ if (lkey.namlen < rkey.namlen) { *kname = *pr; key->namlen = klen + 1; } else /* l->namelen == r->namelen */ key->namlen = klen; free_names: kfree(lkey.name); kfree(rkey.name); return 0; } /* * dtGetKey() * * function: get key of the entry */ static void dtGetKey(dtpage_t * p, int i, /* entry index */ struct component_name * key, int flag) { int si; s8 *stbl; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int namlen, len; wchar_t *kname; __le16 *name; /* get entry */ stbl = DT_GETSTBL(p); si = stbl[i]; if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; namlen = lh->namlen; name = lh->name; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; namlen = ih->namlen; name = ih->name; len = min(namlen, DTIHDRDATALEN); } key->namlen = namlen; kname = key->name; /* * move head/only segment */ UniStrncpy_from_le(kname, name, len); /* * move additional segment(s) */ while (si >= 0) { /* get next segment */ t = &p->slot[si]; kname += len; namlen -= len; len = min(namlen, DTSLOTDATALEN); UniStrncpy_from_le(kname, t->name, len); si = t->next; } } /* * dtInsertEntry() * * function: allocate free slot(s) and * write a leaf/internal entry * * return: entry slot index */ static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock ** dtlock) { struct dtslot *h, *t; struct ldtentry *lh = NULL; struct idtentry *ih = NULL; int hsi, fsi, klen, len, nextindex; wchar_t *kname; __le16 *name; s8 *stbl; pxd_t *xd; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; s64 bn = 0; struct metapage *mp = NULL; klen = key->namlen; kname = key->name; /* allocate a free slot */ hsi = fsi = p->header.freelist; h = &p->slot[fsi]; p->header.freelist = h->next; --p->header.freecnt; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = hsi; /* write head/only segment */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) h; lh->next = h->next; lh->inumber = cpu_to_le32(data->leaf.ino); lh->namlen = klen; name = lh->name; if (data->leaf.ip) { len = min(klen, DTLHDRDATALEN); if (!(p->header.flag & BT_ROOT)) bn = addressPXD(&p->header.self); lh->index = cpu_to_le32(add_index(data->leaf.tid, data->leaf.ip, bn, index)); } else len = min(klen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) h; ih->next = h->next; xd = (pxd_t *) ih; *xd = data->xd; ih->namlen = klen; name = ih->name; len = min(klen, DTIHDRDATALEN); } UniStrncpy_to_le(name, kname, len); n = 1; xsi = hsi; /* write additional segment(s) */ t = h; klen -= len; while (klen) { /* get free slot */ fsi = p->header.freelist; t = &p->slot[fsi]; p->header.freelist = t->next; --p->header.freecnt; /* is next slot contiguous ? */ if (fsi != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = fsi; n = 0; } kname += len; len = min(klen, DTSLOTDATALEN); UniStrncpy_to_le(t->name, kname, len); n++; xsi = fsi; klen -= len; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* terminate last/only segment */ if (h == t) { /* single segment entry */ if (p->header.flag & BT_LEAF) lh->next = -1; else ih->next = -1; } else /* multi-segment entry */ t->next = -1; /* if insert into middle, shift right succeeding entries in stbl */ stbl = DT_GETSTBL(p); nextindex = p->header.nextindex; if (index < nextindex) { memmove(stbl + index + 1, stbl + index, nextindex - index); if ((p->header.flag & BT_LEAF) && data->leaf.ip) { s64 lblock; /* * Need to update slot number for entries that moved * in the stbl */ mp = NULL; for (n = index + 1; n <= nextindex; n++) { lh = (struct ldtentry *) & (p->slot[stbl[n]]); modify_index(data->leaf.tid, data->leaf.ip, le32_to_cpu(lh->index), bn, n, &mp, &lblock); } if (mp) release_metapage(mp); } } stbl[index] = hsi; /* advance next available entry index of stbl */ ++p->header.nextindex; } /* * dtMoveEntry() * * function: move entries from split/left page to new/right page * * nextindex of dst page and freelist/freecnt of both pages * are updated. */ static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index) { int ssi, next; /* src slot index */ int di; /* dst entry index */ int dsi; /* dst slot index */ s8 *sstbl, *dstbl; /* sorted entry table */ int snamlen, len; struct ldtentry *slh, *dlh = NULL; struct idtentry *sih, *dih = NULL; struct dtslot *h, *s, *d; struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; struct lv *slv, *dlv; int xssi, ns, nd; int sfsi; sstbl = (s8 *) & sp->slot[sp->header.stblindex]; dstbl = (s8 *) & dp->slot[dp->header.stblindex]; dsi = dp->header.freelist; /* first (whole page) free slot */ sfsi = sp->header.freelist; /* linelock destination entry slot */ dlv = & ddtlck->lv[ddtlck->index]; dlv->offset = dsi; /* linelock source entry slot */ slv = & sdtlck->lv[sdtlck->index]; slv->offset = sstbl[si]; xssi = slv->offset - 1; /* * move entries */ ns = nd = 0; for (di = 0; si < sp->header.nextindex; si++, di++) { ssi = sstbl[si]; dstbl[di] = dsi; /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* * move head/only segment of an entry */ /* get dst slot */ h = d = &dp->slot[dsi]; /* get src slot and move */ s = &sp->slot[ssi]; if (sp->header.flag & BT_LEAF) { /* get source entry */ slh = (struct ldtentry *) s; dlh = (struct ldtentry *) h; snamlen = slh->namlen; if (do_index) { len = min(snamlen, DTLHDRDATALEN); dlh->index = slh->index; /* little-endian */ } else len = min(snamlen, DTLHDRDATALEN_LEGACY); memcpy(dlh, slh, 6 + len * 2); next = slh->next; /* update dst head/only segment next field */ dsi++; dlh->next = dsi; } else { sih = (struct idtentry *) s; snamlen = sih->namlen; len = min(snamlen, DTIHDRDATALEN); dih = (struct idtentry *) h; memcpy(dih, sih, 10 + len * 2); next = sih->next; dsi++; dih->next = dsi; } /* free src head/only segment */ s->next = sfsi; s->cnt = 1; sfsi = ssi; ns++; nd++; xssi = ssi; /* * move additional segment(s) of the entry */ snamlen -= len; while ((ssi = next) >= 0) { /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* get next source segment */ s = &sp->slot[ssi]; /* get next destination free slot */ d++; len = min(snamlen, DTSLOTDATALEN); UniStrncpy_le(d->name, s->name, len); ns++; nd++; xssi = ssi; dsi++; d->next = dsi; /* free source segment */ next = s->next; s->next = sfsi; s->cnt = 1; sfsi = ssi; snamlen -= len; } /* end while */ /* terminate dst last/only segment */ if (h == d) { /* single segment entry */ if (dp->header.flag & BT_LEAF) dlh->next = -1; else dih->next = -1; } else /* multi-segment entry */ d->next = -1; } /* end for */ /* close current linelock */ slv->length = ns; sdtlck->index++; *sdtlock = sdtlck; dlv->length = nd; ddtlck->index++; *ddtlock = ddtlck; /* update source header */ sp->header.freelist = sfsi; sp->header.freecnt += nd; /* update destination header */ dp->header.nextindex = di; dp->header.freelist = dsi; dp->header.freecnt -= nd; } /* * dtDeleteEntry() * * function: free a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); fsi = stbl[fi]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; /* get the head/only segment */ t = &p->slot[fsi]; if (p->header.flag & BT_LEAF) si = ((struct ldtentry *) t)->next; else si = ((struct idtentry *) t)->next; t->next = si; t->cnt = 1; n = freecnt = 1; xsi = fsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; /* if delete from middle, * shift left the succedding entries in the stbl */ si = p->header.nextindex; if (fi < si - 1) memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); p->header.nextindex--; } /* * dtTruncateEntry() * * function: truncate a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) { int tsi; /* truncate entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int fsi, xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); tsi = stbl[ti]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = tsi; /* get the head/only segment */ t = &p->slot[tsi]; ASSERT(p->header.flag & BT_INTERNAL); ((struct idtentry *) t)->namlen = 0; si = ((struct idtentry *) t)->next; ((struct idtentry *) t)->next = -1; n = 1; freecnt = 0; fsi = si; xsi = tsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ if (freecnt == 0) return; t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; } /* * dtLinelockFreelist() */ static void dtLinelockFreelist(dtpage_t * p, /* directory page */ int m, /* max slot index */ struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ struct dtslot *t; int si; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ fsi = p->header.freelist; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; n = 1; xsi = fsi; t = &p->slot[fsi]; si = t->next; /* find the last/only segment */ while (si < m && si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; t = &p->slot[si]; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; } /* * NAME: dtModify * * FUNCTION: Modify the inode number part of a directory entry * * PARAMETERS: * tid - Transaction id * ip - Inode of parent directory * key - Name of entry to be modified * orig_ino - Original inode number expected in entry * new_ino - New inode number to put into entry * flag - JFS_RENAME * * RETURNS: * -ESTALE - If entry found does not match orig_ino passed in * -ENOENT - If no entry can be found to match key * 0 - If successfully modified entry */ int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) { int rc; s64 bn; struct metapage *mp; dtpage_t *p; int index; struct btstack btstack; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; s8 *stbl; int entry_si; /* entry slot index */ struct ldtentry *entry; /* * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page of named entry */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* get slot index of the entry */ stbl = DT_GETSTBL(p); entry_si = stbl[index]; /* linelock entry */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = entry_si; lv->length = 1; dtlck->index++; /* get the head/only segment */ entry = (struct ldtentry *) & p->slot[entry_si]; /* substitute the inode number of the entry */ entry->inumber = cpu_to_le32(new_ino); /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; } |
| 2 5310 5314 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 | // SPDX-License-Identifier: GPL-2.0 /* * bio-integrity.c - bio data integrity extensions * * Copyright (C) 2007, 2008, 2009 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> */ #include <linux/blk-integrity.h> #include <linux/mempool.h> #include <linux/export.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/slab.h> #include "blk.h" static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; void blk_flush_integrity(void) { flush_workqueue(kintegrityd_wq); } /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed * * Description: Free the integrity portion of a bio. */ void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); } bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { struct bio_integrity_payload *bip; struct bio_set *bs = bio->bi_pool; unsigned inline_vecs; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); inline_vecs = BIO_INLINE_VECS; } if (unlikely(!bip)) return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); /* always report as many vecs as asked explicitly, not inline vecs */ bip->bip_max_vcnt = nr_vecs; if (nr_vecs > inline_vecs) { bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; } else if (nr_vecs) { bip->bip_vec = bip->bip_inline_vecs; } bip->bip_bio = bio; bio->bi_integrity = bip; bio->bi_opf |= REQ_INTEGRITY; return bip; err: if (bs && mempool_initialized(&bs->bio_integrity_pool)) mempool_free(bip, &bs->bio_integrity_pool); else kfree(bip); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, bool dirty) { int i; for (i = 0; i < nr_vecs; i++) { if (dirty && !PageCompound(bv[i].bv_page)) set_page_dirty_lock(bv[i].bv_page); unpin_user_page(bv[i].bv_page); } } static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) { unsigned short nr_vecs = bip->bip_max_vcnt - 1; struct bio_vec *copy = &bip->bip_vec[1]; size_t bytes = bip->bip_iter.bi_size; struct iov_iter iter; int ret; iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); WARN_ON_ONCE(ret != bytes); bio_integrity_unpin_bvec(copy, nr_vecs, true); } /** * bio_integrity_unmap_user - Unmap user integrity payload * @bio: bio containing bip to be unmapped * * Unmap the user mapped integrity portion of a bio. */ void bio_integrity_unmap_user(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip->bip_flags & BIP_COPY_USER) { if (bio_data_dir(bio) == READ) bio_integrity_uncopy_user(bip); kfree(bvec_virt(bip->bip_vec)); return; } bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, bio_data_dir(bio) == READ); } /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update * @page: page containing integrity metadata * @len: number of bytes of integrity metadata in page * @offset: start offset within page * * Description: Attach a page containing integrity metadata to bio. */ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); if (bip->bip_vcnt > 0) { struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; bool same_page = false; if (bvec_try_merge_hw_page(q, bv, page, len, offset, &same_page)) { bip->bip_iter.bi_size += len; return len; } if (bip->bip_vcnt >= min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) return 0; /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; bip->bip_iter.bi_size += len; return len; } EXPORT_SYMBOL(bio_integrity_add_page); static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, int nr_vecs, unsigned int len, unsigned int direction, u32 seed) { bool write = direction == ITER_SOURCE; struct bio_integrity_payload *bip; struct iov_iter iter; void *buf; int ret; buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; if (write) { iov_iter_bvec(&iter, direction, bvec, nr_vecs, len); if (!copy_from_iter_full(buf, len, &iter)) { ret = -EFAULT; goto free_buf; } bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); } else { memset(buf, 0, len); /* * We need to preserve the original bvec and the number of vecs * in it for completion handling */ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1); } if (IS_ERR(bip)) { ret = PTR_ERR(bip); goto free_buf; } if (write) bio_integrity_unpin_bvec(bvec, nr_vecs, false); else memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec)); ret = bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)); if (ret != len) { ret = -ENOMEM; goto free_bip; } bip->bip_flags |= BIP_COPY_USER; bip->bip_iter.bi_sector = seed; bip->bip_vcnt = nr_vecs; return 0; free_bip: bio_integrity_free(bio); free_buf: kfree(buf); return ret; } static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, int nr_vecs, unsigned int len, u32 seed) { struct bio_integrity_payload *bip; bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs); if (IS_ERR(bip)) return PTR_ERR(bip); memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); bip->bip_iter.bi_sector = seed; bip->bip_iter.bi_size = len; bip->bip_vcnt = nr_vecs; return 0; } static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, int nr_vecs, ssize_t bytes, ssize_t offset) { unsigned int nr_bvecs = 0; int i, j; for (i = 0; i < nr_vecs; i = j) { size_t size = min_t(size_t, bytes, PAGE_SIZE - offset); struct folio *folio = page_folio(pages[i]); bytes -= size; for (j = i + 1; j < nr_vecs; j++) { size_t next = min_t(size_t, PAGE_SIZE, bytes); if (page_folio(pages[j]) != folio || pages[j] != pages[j - 1] + 1) break; unpin_user_page(pages[j]); size += next; bytes -= next; } bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); offset = 0; nr_bvecs++; } return nr_bvecs; } int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, u32 seed) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; unsigned int direction, nr_bvecs; struct iov_iter iter; int ret, nr_vecs; size_t offset; bool copy; if (bio_integrity(bio)) return -EINVAL; if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q)) return -E2BIG; if (bio_data_dir(bio) == READ) direction = ITER_DEST; else direction = ITER_SOURCE; iov_iter_ubuf(&iter, direction, ubuf, bytes); nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); if (!bvec) return -ENOMEM; pages = NULL; } copy = !iov_iter_is_aligned(&iter, align, align); ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); if (unlikely(ret < 0)) goto free_bvec; nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); if (pages != stack_pages) kvfree(pages); if (nr_bvecs > queue_max_integrity_segments(q)) copy = true; if (copy) ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, direction, seed); else ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed); if (ret) goto release_pages; if (bvec != stack_vec) kfree(bvec); return 0; release_pages: bio_integrity_unpin_bvec(bvec, nr_bvecs, false); free_bvec: if (bvec != stack_vec) kfree(bvec); return ret; } /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare * * Description: Checks if the bio already has an integrity payload attached. * If it does, the payload has been generated by another kernel subsystem, * and we just pass it through. Otherwise allocates integrity payload. * The bio must have data direction, target device and start sector set priot * to calling. In the WRITE case, integrity metadata will be generated using * the block device's integrity function. In the READ case, the buffer * will be prepared for DMA and a suitable end_io handler set up. */ bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned int len; void *buf; gfp_t gfp = GFP_NOIO; if (!bi) return true; if (!bio_sectors(bio)) return true; /* Already protected? */ if (bio_integrity(bio)) return true; switch (bio_op(bio)) { case REQ_OP_READ: if (bi->flags & BLK_INTEGRITY_NOVERIFY) return true; break; case REQ_OP_WRITE: if (bi->flags & BLK_INTEGRITY_NOGENERATE) return true; /* * Zero the memory allocated to not leak uninitialized kernel * memory to disk for non-integrity metadata where nothing else * initializes the memory. */ if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) gfp |= __GFP_ZERO; break; default: return true; } /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (unlikely(buf == NULL)) { goto err_end_io; } bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (IS_ERR(bip)) { kfree(buf); goto err_end_io; } bip->bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) { printk(KERN_ERR "could not attach integrity payload\n"); goto err_end_io; } /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) blk_integrity_generate(bio); else bip->bio_iter = bio->bi_iter; return true; err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; } EXPORT_SYMBOL(bio_integrity_prep); /** * bio_integrity_verify_fn - Integrity I/O completion worker * @work: Work struct stored in bio to be verified * * Description: This workqueue function is called to complete a READ * request. The function verifies the transferred integrity metadata * and then calls the original bio end_io function. */ static void bio_integrity_verify_fn(struct work_struct *work) { struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; blk_integrity_verify(bio); kfree(bvec_virt(bip->bip_vec)); bio_integrity_free(bio); bio_endio(bio); } /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio * * Description: Completion for integrity I/O * * Normally I/O completion is done in interrupt context. However, * verifying I/O integrity is a time-consuming task which must be run * in process context. This function postpones completion * accordingly. */ bool __bio_integrity_endio(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); return false; } kfree(bvec_virt(bip->bip_vec)); bio_integrity_free(bio); return true; } /** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update * @bytes_done: number of data bytes that have been completed * * Description: This function calculates how many integrity bytes the * number of completed data bytes correspond to and advances the * integrity vector accordingly. */ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } /** * bio_integrity_trim - Trim integrity vector * @bio: bio whose integrity vector to update * * Description: Used to trim the integrity vector in a cloned bio. */ void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } EXPORT_SYMBOL(bio_integrity_trim); /** * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { struct bio_integrity_payload *bip_src = bio_integrity(bio_src); struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, 0); if (IS_ERR(bip)) return PTR_ERR(bip); bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; return 0; } int bioset_integrity_create(struct bio_set *bs, int pool_size) { if (mempool_initialized(&bs->bio_integrity_pool)) return 0; if (mempool_init_slab_pool(&bs->bio_integrity_pool, pool_size, bip_slab)) return -1; if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { mempool_exit(&bs->bio_integrity_pool); return -1; } return 0; } EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { mempool_exit(&bs->bio_integrity_pool); mempool_exit(&bs->bvec_integrity_pool); } void __init bio_integrity_init(void) { /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. */ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIO_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } |
| 43 31 40 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | /* * linux/fs/nls/nls_koi8-u.c * * Charset koi8-u translation tables. * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, /* 0x90*/ 0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248, 0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7, /* 0xa0*/ 0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x0491, 0x255d, 0x255e, /* 0xb0*/ 0x255f, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407, 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x0490, 0x256c, 0x00a9, /* 0xc0*/ 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, /* 0xd0*/ 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, /* 0xe0*/ 0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, /* 0xf0*/ 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ }; static const unsigned char page04[256] = { 0x00, 0xb3, 0x00, 0x00, 0xb4, 0x00, 0xb6, 0xb7, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ 0x00, 0xa3, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xbd, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ }; static const unsigned char page23[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char page25[256] = { 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xa0, 0xa1, 0xa2, 0x00, 0xa5, 0x00, 0x00, 0xa8, /* 0x50-0x57 */ 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ 0xb1, 0xb2, 0x00, 0xb5, 0x00, 0x00, 0xb8, 0xb9, /* 0x60-0x67 */ 0xba, 0xbb, 0xbc, 0x00, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page22, page23, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xa3, 0xa4, 0xb5, 0xa6, 0xa7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xb3, 0xb4, 0xa5, 0xb6, 0xb7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "koi8-u", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_koi8_u(void) { return register_nls(&table); } static void __exit exit_nls_koi8_u(void) { unregister_nls(&table); } module_init(init_nls_koi8_u) module_exit(exit_nls_koi8_u) MODULE_DESCRIPTION("NLS KOI8-U (Ukrainian)"); MODULE_LICENSE("Dual BSD/GPL"); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H /* * Determines how hard direct compaction should try to succeed. * Lower value means higher priority, analogically to reclaim priority. */ enum compact_priority { COMPACT_PRIO_SYNC_FULL, MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC, INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC }; /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { /* For more detailed tracepoint output - internal to compaction */ COMPACT_NOT_SUITABLE_ZONE, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable */ COMPACT_SKIPPED, /* compaction didn't start as it was deferred due to past failures */ COMPACT_DEFERRED, /* For more detailed tracepoint output - internal to compaction */ COMPACT_NO_SUITABLE_PAGE, /* compaction should continue to another pageblock */ COMPACT_CONTINUE, /* * The full zone was compacted scanned but wasn't successful to compact * suitable pages. */ COMPACT_COMPLETE, /* * direct compaction has scanned part of the zone but wasn't successful * to compact suitable pages. */ COMPACT_PARTIAL_SKIPPED, /* compaction terminated prematurely due to lock contentions */ COMPACT_CONTENDED, /* * direct compaction terminated after concluding that the allocation * should now succeed */ COMPACT_SUCCESS, }; struct alloc_context; /* in mm/internal.h */ /* * Number of free order-0 pages that should be available above given watermark * to make sure compaction has reasonable chance of not running out of free * pages that it needs to isolate as migration target during its work. */ static inline unsigned long compact_gap(unsigned int order) { /* * Although all the isolations for migration are temporary, compaction * free scanner may have up to 1 << order pages on its list and then * try to split an (order - 1) free page. At that point, a gap of * 1 << order might not be enough, so it's safer to require twice that * amount. Note that the number of pages on the list is also * effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum * that the migrate scanner can have isolated on migrate list, and free * scanner is only invoked when the number of isolated free pages is * lower than that. But it's not worth to complicate the formula here * as a bigger gap for higher orders than strictly necessary can also * improve chances of compaction success. */ return 2UL << order; } #ifdef CONFIG_COMPACTION extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order); extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags); extern void __meminit kcompactd_run(int nid); extern void __meminit kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); #else static inline void reset_isolation_suitable(pg_data_t *pgdat) { } static inline bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { return false; } static inline void kcompactd_run(int nid) { } static inline void kcompactd_stop(int nid) { } static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) { } #endif /* CONFIG_COMPACTION */ struct node; #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) extern int compaction_register_node(struct node *node); extern void compaction_unregister_node(struct node *node); #else static inline int compaction_register_node(struct node *node) { return 0; } static inline void compaction_unregister_node(struct node *node) { } #endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */ #endif /* _LINUX_COMPACTION_H */ |
| 1 1 1 1 11 10 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | // SPDX-License-Identifier: GPL-2.0 /* * FUSE inode io modes. * * Copyright (c) 2024 CTERA Networks. */ #include "fuse_i.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/fs.h> /* * Return true if need to wait for new opens in caching mode. */ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) { return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); } /* * Called on cached file open() and on first mmap() of direct_io file. * Takes cached_io inode mode reference to be dropped on file release. * * Blocks new parallel dio writes and waits for the in-progress parallel dio * writes to complete. */ int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); /* There are no io modes if server does not implement open */ if (!ff->args) return 0; spin_lock(&fi->lock); /* * Setting the bit advises new direct-io writes to use an exclusive * lock - without it the wait below might be forever. */ while (fuse_is_io_cache_wait(fi)) { set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); spin_lock(&fi->lock); } /* * Check if inode entered passthrough io mode while waiting for parallel * dio write completion. */ if (fuse_inode_backing(fi)) { clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); return -ETXTBSY; } WARN_ON(ff->iomode == IOM_UNCACHED); if (ff->iomode == IOM_NONE) { ff->iomode = IOM_CACHED; if (fi->iocachectr == 0) set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); fi->iocachectr++; } spin_unlock(&fi->lock); return 0; } static void fuse_file_cached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); ff->iomode = IOM_NONE; fi->iocachectr--; if (fi->iocachectr == 0) clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); } /* Start strictly uncached io mode where cache access is not allowed */ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { struct fuse_backing *oldfb; int err = 0; spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } if (fi->iocachectr > 0) { err = -ETXTBSY; goto unlock; } fi->iocachectr--; /* fuse inode holds a single refcount of backing file */ if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { fuse_backing_put(fb); } unlock: spin_unlock(&fi->lock); return err; } /* Takes uncached_io inode mode reference to be dropped on file release */ static int fuse_file_uncached_io_open(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); int err; err = fuse_inode_uncached_io_start(fi, fb); if (err) return err; WARN_ON(ff->iomode != IOM_NONE); ff->iomode = IOM_UNCACHED; return 0; } void fuse_inode_uncached_io_end(struct fuse_inode *fi) { struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); oldfb = fuse_inode_backing_set(fi, NULL); } spin_unlock(&fi->lock); if (oldfb) fuse_backing_put(oldfb); } /* Drop uncached_io reference from passthrough open */ static void fuse_file_uncached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { WARN_ON(ff->iomode != IOM_UNCACHED); ff->iomode = IOM_NONE; fuse_inode_uncached_io_end(fi); } /* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write * operations go directly to the server, but mmap is done on the backing file. * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. */ #define FOPEN_PASSTHROUGH_MASK \ (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ FOPEN_NOFLUSH) static int fuse_file_passthrough_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_backing *fb; int err; /* Check allowed conditions for file open in passthrough mode */ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; fb = fuse_passthrough_open(file, inode, ff->args->open_outarg.backing_id); if (IS_ERR(fb)) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; fuse_passthrough_release(ff, fb); fuse_backing_put(fb); return err; } /* Request access to submit new io to inode via open file */ int fuse_file_io_open(struct file *file, struct inode *inode) { struct fuse_file *ff = file->private_data; struct fuse_inode *fi = get_fuse_inode(inode); int err; /* * io modes are not relevant with DAX and with server that does not * implement open. */ if (FUSE_IS_DAX(inode) || !ff->args) return 0; /* * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode * which is already open for passthrough. */ err = -EINVAL; if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) goto fail; /* * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. */ if (!(ff->open_flags & FOPEN_DIRECT_IO)) ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; /* * First passthrough file open denies caching inode io mode. * First caching file open enters caching inode io mode. * * Note that if user opens a file open with O_DIRECT, but server did * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, * so we put the inode in caching mode to prevent parallel dio. */ if ((ff->open_flags & FOPEN_DIRECT_IO) && !(ff->open_flags & FOPEN_PASSTHROUGH)) return 0; if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; return 0; fail: pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", ff->open_flags, err); /* * The file open mode determines the inode io mode. * Using incorrect open mode is a server mistake, which results in * user visible failure of open() with EIO error. */ return -EIO; } /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); /* * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { case IOM_NONE: /* Nothing to do */ break; case IOM_UNCACHED: fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: fuse_file_cached_io_release(ff, fi); break; } } |
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 | // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/miscdevice.h> #include <linux/interrupt.h> #include <linux/highmem.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/pci.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/io.h> #include "vmci_handle_array.h" #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_doorbell.h" #include "vmci_resource.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" #define VMCI_UTIL_NUM_RESOURCES 1 enum { VMCI_NOTIFY_RESOURCE_QUEUE_PAIR = 0, VMCI_NOTIFY_RESOURCE_DOOR_BELL = 1, }; enum { VMCI_NOTIFY_RESOURCE_ACTION_NOTIFY = 0, VMCI_NOTIFY_RESOURCE_ACTION_CREATE = 1, VMCI_NOTIFY_RESOURCE_ACTION_DESTROY = 2, }; /* * VMCI driver initialization. This block can also be used to * pass initial group membership etc. */ struct vmci_init_blk { u32 cid; u32 flags; }; /* VMCIqueue_pairAllocInfo_VMToVM */ struct vmci_qp_alloc_info_vmvm { struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u64 produce_page_file; /* User VA. */ u64 consume_page_file; /* User VA. */ u64 produce_page_file_size; /* Size of the file name array. */ u64 consume_page_file_size; /* Size of the file name array. */ s32 result; u32 _pad; }; /* VMCISetNotifyInfo: Used to pass notify flag's address to the host driver. */ struct vmci_set_notify_info { u64 notify_uva; s32 result; u32 _pad; }; /* * Per-instance host state */ struct vmci_host_dev { struct vmci_ctx *context; int user_version; enum vmci_obj_type ct_type; struct mutex lock; /* Mutex lock for vmci context access */ }; static struct vmci_ctx *host_context; static bool vmci_host_device_initialized; static atomic_t vmci_host_active_users = ATOMIC_INIT(0); /* * Determines whether the VMCI host personality is * available. Since the core functionality of the host driver is * always present, all guests could possibly use the host * personality. However, to minimize the deviation from the * pre-unified driver state of affairs, we only consider the host * device active if there is no active guest device or if there * are VMX'en with active VMCI contexts using the host device. */ bool vmci_host_code_active(void) { return vmci_host_device_initialized && (!vmci_guest_code_active() || atomic_read(&vmci_host_active_users) > 0); } int vmci_host_users(void) { return atomic_read(&vmci_host_active_users); } /* * Called on open of /dev/vmci. */ static int vmci_host_open(struct inode *inode, struct file *filp) { struct vmci_host_dev *vmci_host_dev; vmci_host_dev = kzalloc(sizeof(struct vmci_host_dev), GFP_KERNEL); if (vmci_host_dev == NULL) return -ENOMEM; vmci_host_dev->ct_type = VMCIOBJ_NOT_SET; mutex_init(&vmci_host_dev->lock); filp->private_data = vmci_host_dev; return 0; } /* * Called on close of /dev/vmci, most often when the process * exits. */ static int vmci_host_close(struct inode *inode, struct file *filp) { struct vmci_host_dev *vmci_host_dev = filp->private_data; if (vmci_host_dev->ct_type == VMCIOBJ_CONTEXT) { vmci_ctx_destroy(vmci_host_dev->context); vmci_host_dev->context = NULL; /* * The number of active contexts is used to track whether any * VMX'en are using the host personality. It is incremented when * a context is created through the IOCTL_VMCI_INIT_CONTEXT * ioctl. */ atomic_dec(&vmci_host_active_users); } vmci_host_dev->ct_type = VMCIOBJ_NOT_SET; kfree(vmci_host_dev); filp->private_data = NULL; return 0; } /* * This is used to wake up the VMX when a VMCI call arrives, or * to wake up select() or poll() at the next clock tick. */ static __poll_t vmci_host_poll(struct file *filp, poll_table *wait) { struct vmci_host_dev *vmci_host_dev = filp->private_data; struct vmci_ctx *context; __poll_t mask = 0; if (vmci_host_dev->ct_type == VMCIOBJ_CONTEXT) { /* * Read context only if ct_type == VMCIOBJ_CONTEXT to make * sure that context is initialized */ context = vmci_host_dev->context; /* Check for VMCI calls to this VM context. */ if (wait) poll_wait(filp, &context->host_context.wait_queue, wait); spin_lock(&context->lock); if (context->pending_datagrams > 0 || vmci_handle_arr_get_size( context->pending_doorbell_array) > 0) { mask = EPOLLIN; } spin_unlock(&context->lock); } return mask; } /* * Copies the handles of a handle array into a user buffer, and * returns the new length in userBufferSize. If the copy to the * user buffer fails, the functions still returns VMCI_SUCCESS, * but retval != 0. */ static int drv_cp_harray_to_user(void __user *user_buf_uva, u64 *user_buf_size, struct vmci_handle_arr *handle_array, int *retval) { u32 array_size = 0; struct vmci_handle *handles; if (handle_array) array_size = vmci_handle_arr_get_size(handle_array); if (array_size * sizeof(*handles) > *user_buf_size) return VMCI_ERROR_MORE_DATA; *user_buf_size = array_size * sizeof(*handles); if (*user_buf_size) *retval = copy_to_user(user_buf_uva, vmci_handle_arr_get_handles (handle_array), *user_buf_size); return VMCI_SUCCESS; } /* * Sets up a given context for notify to work. Maps the notify * boolean in user VA into kernel space. */ static int vmci_host_setup_notify(struct vmci_ctx *context, unsigned long uva) { int retval; if (context->notify_page) { pr_devel("%s: Notify mechanism is already set up\n", __func__); return VMCI_ERROR_DUPLICATE_ENTRY; } /* * We are using 'bool' internally, but let's make sure we explicit * about the size. */ BUILD_BUG_ON(sizeof(bool) != sizeof(u8)); /* * Lock physical page backing a given user VA. */ retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); if (retval != 1) { context->notify_page = NULL; return VMCI_ERROR_GENERIC; } if (context->notify_page == NULL) return VMCI_ERROR_UNAVAILABLE; /* * Map the locked page and set up notify pointer. */ context->notify = kmap(context->notify_page) + (uva & (PAGE_SIZE - 1)); vmci_ctx_check_signal_notify(context); return VMCI_SUCCESS; } static int vmci_host_get_version(struct vmci_host_dev *vmci_host_dev, unsigned int cmd, void __user *uptr) { if (cmd == IOCTL_VMCI_VERSION2) { int __user *vptr = uptr; if (get_user(vmci_host_dev->user_version, vptr)) return -EFAULT; } /* * The basic logic here is: * * If the user sends in a version of 0 tell it our version. * If the user didn't send in a version, tell it our version. * If the user sent in an old version, tell it -its- version. * If the user sent in an newer version, tell it our version. * * The rationale behind telling the caller its version is that * Workstation 6.5 required that VMX and VMCI kernel module were * version sync'd. All new VMX users will be programmed to * handle the VMCI kernel module version. */ if (vmci_host_dev->user_version > 0 && vmci_host_dev->user_version < VMCI_VERSION_HOSTQP) { return vmci_host_dev->user_version; } return VMCI_VERSION; } #define vmci_ioctl_err(fmt, ...) \ pr_devel("%s: " fmt, ioctl_name, ##__VA_ARGS__) static int vmci_host_do_init_context(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_init_blk init_block; const struct cred *cred; int retval; if (copy_from_user(&init_block, uptr, sizeof(init_block))) { vmci_ioctl_err("error reading init block\n"); return -EFAULT; } mutex_lock(&vmci_host_dev->lock); if (vmci_host_dev->ct_type != VMCIOBJ_NOT_SET) { vmci_ioctl_err("received VMCI init on initialized handle\n"); retval = -EINVAL; goto out; } if (init_block.flags & ~VMCI_PRIVILEGE_FLAG_RESTRICTED) { vmci_ioctl_err("unsupported VMCI restriction flag\n"); retval = -EINVAL; goto out; } cred = get_current_cred(); vmci_host_dev->context = vmci_ctx_create(init_block.cid, init_block.flags, 0, vmci_host_dev->user_version, cred); put_cred(cred); if (IS_ERR(vmci_host_dev->context)) { retval = PTR_ERR(vmci_host_dev->context); vmci_ioctl_err("error initializing context\n"); goto out; } /* * Copy cid to userlevel, we do this to allow the VMX * to enforce its policy on cid generation. */ init_block.cid = vmci_ctx_get_id(vmci_host_dev->context); if (copy_to_user(uptr, &init_block, sizeof(init_block))) { vmci_ctx_destroy(vmci_host_dev->context); vmci_host_dev->context = NULL; vmci_ioctl_err("error writing init block\n"); retval = -EFAULT; goto out; } vmci_host_dev->ct_type = VMCIOBJ_CONTEXT; atomic_inc(&vmci_host_active_users); vmci_call_vsock_callback(true); retval = 0; out: mutex_unlock(&vmci_host_dev->lock); return retval; } static int vmci_host_do_send_datagram(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_datagram_snd_rcv_info send_info; struct vmci_datagram *dg = NULL; u32 cid; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&send_info, uptr, sizeof(send_info))) return -EFAULT; if (send_info.len > VMCI_MAX_DG_SIZE) { vmci_ioctl_err("datagram is too big (size=%d)\n", send_info.len); return -EINVAL; } if (send_info.len < sizeof(*dg)) { vmci_ioctl_err("datagram is too small (size=%d)\n", send_info.len); return -EINVAL; } dg = memdup_user((void __user *)(uintptr_t)send_info.addr, send_info.len); if (IS_ERR(dg)) { vmci_ioctl_err( "cannot allocate memory to dispatch datagram\n"); return PTR_ERR(dg); } if (VMCI_DG_SIZE(dg) != send_info.len) { vmci_ioctl_err("datagram size mismatch\n"); kfree(dg); return -EINVAL; } pr_devel("Datagram dst (handle=0x%x:0x%x) src (handle=0x%x:0x%x), payload (size=%llu bytes)\n", dg->dst.context, dg->dst.resource, dg->src.context, dg->src.resource, (unsigned long long)dg->payload_size); /* Get source context id. */ cid = vmci_ctx_get_id(vmci_host_dev->context); send_info.result = vmci_datagram_dispatch(cid, dg, true); kfree(dg); return copy_to_user(uptr, &send_info, sizeof(send_info)) ? -EFAULT : 0; } static int vmci_host_do_receive_datagram(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_datagram_snd_rcv_info recv_info; struct vmci_datagram *dg = NULL; int retval; size_t size; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&recv_info, uptr, sizeof(recv_info))) return -EFAULT; size = recv_info.len; recv_info.result = vmci_ctx_dequeue_datagram(vmci_host_dev->context, &size, &dg); if (recv_info.result >= VMCI_SUCCESS) { void __user *ubuf = (void __user *)(uintptr_t)recv_info.addr; retval = copy_to_user(ubuf, dg, VMCI_DG_SIZE(dg)); kfree(dg); if (retval != 0) return -EFAULT; } return copy_to_user(uptr, &recv_info, sizeof(recv_info)) ? -EFAULT : 0; } static int vmci_host_do_alloc_queuepair(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_handle handle; int vmci_status; int __user *retptr; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { struct vmci_qp_alloc_info_vmvm alloc_info; struct vmci_qp_alloc_info_vmvm __user *info = uptr; if (copy_from_user(&alloc_info, uptr, sizeof(alloc_info))) return -EFAULT; handle = alloc_info.handle; retptr = &info->result; vmci_status = vmci_qp_broker_alloc(alloc_info.handle, alloc_info.peer, alloc_info.flags, VMCI_NO_PRIVILEGE_FLAGS, alloc_info.produce_size, alloc_info.consume_size, NULL, vmci_host_dev->context); if (vmci_status == VMCI_SUCCESS) vmci_status = VMCI_SUCCESS_QUEUEPAIR_CREATE; } else { struct vmci_qp_alloc_info alloc_info; struct vmci_qp_alloc_info __user *info = uptr; struct vmci_qp_page_store page_store; if (copy_from_user(&alloc_info, uptr, sizeof(alloc_info))) return -EFAULT; handle = alloc_info.handle; retptr = &info->result; page_store.pages = alloc_info.ppn_va; page_store.len = alloc_info.num_ppns; vmci_status = vmci_qp_broker_alloc(alloc_info.handle, alloc_info.peer, alloc_info.flags, VMCI_NO_PRIVILEGE_FLAGS, alloc_info.produce_size, alloc_info.consume_size, &page_store, vmci_host_dev->context); } if (put_user(vmci_status, retptr)) { if (vmci_status >= VMCI_SUCCESS) { vmci_status = vmci_qp_broker_detach(handle, vmci_host_dev->context); } return -EFAULT; } return 0; } static int vmci_host_do_queuepair_setva(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_set_va_info set_va_info; struct vmci_qp_set_va_info __user *info = uptr; s32 result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { vmci_ioctl_err("is not allowed\n"); return -EINVAL; } if (copy_from_user(&set_va_info, uptr, sizeof(set_va_info))) return -EFAULT; if (set_va_info.va) { /* * VMX is passing down a new VA for the queue * pair mapping. */ result = vmci_qp_broker_map(set_va_info.handle, vmci_host_dev->context, set_va_info.va); } else { /* * The queue pair is about to be unmapped by * the VMX. */ result = vmci_qp_broker_unmap(set_va_info.handle, vmci_host_dev->context, 0); } return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_queuepair_setpf(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_page_file_info page_file_info; struct vmci_qp_page_file_info __user *info = uptr; s32 result; if (vmci_host_dev->user_version < VMCI_VERSION_HOSTQP || vmci_host_dev->user_version >= VMCI_VERSION_NOVMVM) { vmci_ioctl_err("not supported on this VMX (version=%d)\n", vmci_host_dev->user_version); return -EINVAL; } if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&page_file_info, uptr, sizeof(*info))) return -EFAULT; /* * Communicate success pre-emptively to the caller. Note that the * basic premise is that it is incumbent upon the caller not to look at * the info.result field until after the ioctl() returns. And then, * only if the ioctl() result indicates no error. We send up the * SUCCESS status before calling SetPageStore() store because failing * to copy up the result code means unwinding the SetPageStore(). * * It turns out the logic to unwind a SetPageStore() opens a can of * worms. For example, if a host had created the queue_pair and a * guest attaches and SetPageStore() is successful but writing success * fails, then ... the host has to be stopped from writing (anymore) * data into the queue_pair. That means an additional test in the * VMCI_Enqueue() code path. Ugh. */ if (put_user(VMCI_SUCCESS, &info->result)) { /* * In this case, we can't write a result field of the * caller's info block. So, we don't even try to * SetPageStore(). */ return -EFAULT; } result = vmci_qp_broker_set_page_store(page_file_info.handle, page_file_info.produce_va, page_file_info.consume_va, vmci_host_dev->context); if (result < VMCI_SUCCESS) { if (put_user(result, &info->result)) { /* * Note that in this case the SetPageStore() * call failed but we were unable to * communicate that to the caller (because the * copy_to_user() call failed). So, if we * simply return an error (in this case * -EFAULT) then the caller will know that the * SetPageStore failed even though we couldn't * put the result code in the result field and * indicate exactly why it failed. * * That says nothing about the issue where we * were once able to write to the caller's info * memory and now can't. Something more * serious is probably going on than the fact * that SetPageStore() didn't work. */ return -EFAULT; } } return 0; } static int vmci_host_do_qp_detach(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_qp_dtch_info detach_info; struct vmci_qp_dtch_info __user *info = uptr; s32 result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&detach_info, uptr, sizeof(detach_info))) return -EFAULT; result = vmci_qp_broker_detach(detach_info.handle, vmci_host_dev->context); if (result == VMCI_SUCCESS && vmci_host_dev->user_version < VMCI_VERSION_NOVMVM) { result = VMCI_SUCCESS_LAST_DETACH; } return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_add_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_info ar_info; struct vmci_ctx_info __user *info = uptr; s32 result; u32 cid; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&ar_info, uptr, sizeof(ar_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); result = vmci_ctx_add_notification(cid, ar_info.remote_cid); return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_remove_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_info ar_info; struct vmci_ctx_info __user *info = uptr; u32 cid; int result; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&ar_info, uptr, sizeof(ar_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); result = vmci_ctx_remove_notification(cid, ar_info.remote_cid); return put_user(result, &info->result) ? -EFAULT : 0; } static int vmci_host_do_ctx_get_cpt_state(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_chkpt_buf_info get_info; u32 cid; void *cpt_buf; int retval; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&get_info, uptr, sizeof(get_info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); get_info.result = vmci_ctx_get_chkpt_state(cid, get_info.cpt_type, &get_info.buf_size, &cpt_buf); if (get_info.result == VMCI_SUCCESS && get_info.buf_size) { void __user *ubuf = (void __user *)(uintptr_t)get_info.cpt_buf; retval = copy_to_user(ubuf, cpt_buf, get_info.buf_size); kfree(cpt_buf); if (retval) return -EFAULT; } return copy_to_user(uptr, &get_info, sizeof(get_info)) ? -EFAULT : 0; } static int vmci_host_do_ctx_set_cpt_state(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_chkpt_buf_info set_info; u32 cid; void *cpt_buf; int retval; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&set_info, uptr, sizeof(set_info))) return -EFAULT; cpt_buf = memdup_user((void __user *)(uintptr_t)set_info.cpt_buf, set_info.buf_size); if (IS_ERR(cpt_buf)) return PTR_ERR(cpt_buf); cid = vmci_ctx_get_id(vmci_host_dev->context); set_info.result = vmci_ctx_set_chkpt_state(cid, set_info.cpt_type, set_info.buf_size, cpt_buf); retval = copy_to_user(uptr, &set_info, sizeof(set_info)) ? -EFAULT : 0; kfree(cpt_buf); return retval; } static int vmci_host_do_get_context_id(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { u32 __user *u32ptr = uptr; return put_user(VMCI_HOST_CONTEXT_ID, u32ptr) ? -EFAULT : 0; } static int vmci_host_do_set_notify(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_set_notify_info notify_info; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(¬ify_info, uptr, sizeof(notify_info))) return -EFAULT; if (notify_info.notify_uva) { notify_info.result = vmci_host_setup_notify(vmci_host_dev->context, notify_info.notify_uva); } else { vmci_ctx_unset_notify(vmci_host_dev->context); notify_info.result = VMCI_SUCCESS; } return copy_to_user(uptr, ¬ify_info, sizeof(notify_info)) ? -EFAULT : 0; } static int vmci_host_do_notify_resource(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_dbell_notify_resource_info info; u32 cid; if (vmci_host_dev->user_version < VMCI_VERSION_NOTIFY) { vmci_ioctl_err("invalid for current VMX versions\n"); return -EINVAL; } if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (copy_from_user(&info, uptr, sizeof(info))) return -EFAULT; cid = vmci_ctx_get_id(vmci_host_dev->context); switch (info.action) { case VMCI_NOTIFY_RESOURCE_ACTION_NOTIFY: if (info.resource == VMCI_NOTIFY_RESOURCE_DOOR_BELL) { u32 flags = VMCI_NO_PRIVILEGE_FLAGS; info.result = vmci_ctx_notify_dbell(cid, info.handle, flags); } else { info.result = VMCI_ERROR_UNAVAILABLE; } break; case VMCI_NOTIFY_RESOURCE_ACTION_CREATE: info.result = vmci_ctx_dbell_create(cid, info.handle); break; case VMCI_NOTIFY_RESOURCE_ACTION_DESTROY: info.result = vmci_ctx_dbell_destroy(cid, info.handle); break; default: vmci_ioctl_err("got unknown action (action=%d)\n", info.action); info.result = VMCI_ERROR_INVALID_ARGS; } return copy_to_user(uptr, &info, sizeof(info)) ? -EFAULT : 0; } static int vmci_host_do_recv_notifications(struct vmci_host_dev *vmci_host_dev, const char *ioctl_name, void __user *uptr) { struct vmci_ctx_notify_recv_info info; struct vmci_handle_arr *db_handle_array; struct vmci_handle_arr *qp_handle_array; void __user *ubuf; u32 cid; int retval = 0; if (vmci_host_dev->ct_type != VMCIOBJ_CONTEXT) { vmci_ioctl_err("only valid for contexts\n"); return -EINVAL; } if (vmci_host_dev->user_version < VMCI_VERSION_NOTIFY) { vmci_ioctl_err("not supported for the current vmx version\n"); return -EINVAL; } if (copy_from_user(&info, uptr, sizeof(info))) return -EFAULT; if ((info.db_handle_buf_size && !info.db_handle_buf_uva) || (info.qp_handle_buf_size && !info.qp_handle_buf_uva)) { return -EINVAL; } cid = vmci_ctx_get_id(vmci_host_dev->context); info.result = vmci_ctx_rcv_notifications_get(cid, &db_handle_array, &qp_handle_array); if (info.result != VMCI_SUCCESS) return copy_to_user(uptr, &info, sizeof(info)) ? -EFAULT : 0; ubuf = (void __user *)(uintptr_t)info.db_handle_buf_uva; info.result = drv_cp_harray_to_user(ubuf, &info.db_handle_buf_size, db_handle_array, &retval); if (info.result == VMCI_SUCCESS && !retval) { ubuf = (void __user *)(uintptr_t)info.qp_handle_buf_uva; info.result = drv_cp_harray_to_user(ubuf, &info.qp_handle_buf_size, qp_handle_array, &retval); } if (!retval && copy_to_user(uptr, &info, sizeof(info))) retval = -EFAULT; vmci_ctx_rcv_notifications_release(cid, db_handle_array, qp_handle_array, info.result == VMCI_SUCCESS && !retval); return retval; } static long vmci_host_unlocked_ioctl(struct file *filp, unsigned int iocmd, unsigned long ioarg) { #define VMCI_DO_IOCTL(ioctl_name, ioctl_fn) do { \ char *name = "IOCTL_VMCI_" # ioctl_name; \ return vmci_host_do_ ## ioctl_fn( \ vmci_host_dev, name, uptr); \ } while (0) struct vmci_host_dev *vmci_host_dev = filp->private_data; void __user *uptr = (void __user *)ioarg; switch (iocmd) { case IOCTL_VMCI_INIT_CONTEXT: VMCI_DO_IOCTL(INIT_CONTEXT, init_context); case IOCTL_VMCI_DATAGRAM_SEND: VMCI_DO_IOCTL(DATAGRAM_SEND, send_datagram); case IOCTL_VMCI_DATAGRAM_RECEIVE: VMCI_DO_IOCTL(DATAGRAM_RECEIVE, receive_datagram); case IOCTL_VMCI_QUEUEPAIR_ALLOC: VMCI_DO_IOCTL(QUEUEPAIR_ALLOC, alloc_queuepair); case IOCTL_VMCI_QUEUEPAIR_SETVA: VMCI_DO_IOCTL(QUEUEPAIR_SETVA, queuepair_setva); case IOCTL_VMCI_QUEUEPAIR_SETPAGEFILE: VMCI_DO_IOCTL(QUEUEPAIR_SETPAGEFILE, queuepair_setpf); case IOCTL_VMCI_QUEUEPAIR_DETACH: VMCI_DO_IOCTL(QUEUEPAIR_DETACH, qp_detach); case IOCTL_VMCI_CTX_ADD_NOTIFICATION: VMCI_DO_IOCTL(CTX_ADD_NOTIFICATION, ctx_add_notify); case IOCTL_VMCI_CTX_REMOVE_NOTIFICATION: VMCI_DO_IOCTL(CTX_REMOVE_NOTIFICATION, ctx_remove_notify); case IOCTL_VMCI_CTX_GET_CPT_STATE: VMCI_DO_IOCTL(CTX_GET_CPT_STATE, ctx_get_cpt_state); case IOCTL_VMCI_CTX_SET_CPT_STATE: VMCI_DO_IOCTL(CTX_SET_CPT_STATE, ctx_set_cpt_state); case IOCTL_VMCI_GET_CONTEXT_ID: VMCI_DO_IOCTL(GET_CONTEXT_ID, get_context_id); case IOCTL_VMCI_SET_NOTIFY: VMCI_DO_IOCTL(SET_NOTIFY, set_notify); case IOCTL_VMCI_NOTIFY_RESOURCE: VMCI_DO_IOCTL(NOTIFY_RESOURCE, notify_resource); case IOCTL_VMCI_NOTIFICATIONS_RECEIVE: VMCI_DO_IOCTL(NOTIFICATIONS_RECEIVE, recv_notifications); case IOCTL_VMCI_VERSION: case IOCTL_VMCI_VERSION2: return vmci_host_get_version(vmci_host_dev, iocmd, uptr); default: pr_devel("%s: Unknown ioctl (iocmd=%d)\n", __func__, iocmd); return -EINVAL; } #undef VMCI_DO_IOCTL } static const struct file_operations vmuser_fops = { .owner = THIS_MODULE, .open = vmci_host_open, .release = vmci_host_close, .poll = vmci_host_poll, .unlocked_ioctl = vmci_host_unlocked_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static struct miscdevice vmci_host_miscdev = { .name = "vmci", .minor = MISC_DYNAMIC_MINOR, .fops = &vmuser_fops, }; int __init vmci_host_init(void) { int error; host_context = vmci_ctx_create(VMCI_HOST_CONTEXT_ID, VMCI_DEFAULT_PROC_PRIVILEGE_FLAGS, -1, VMCI_VERSION, NULL); if (IS_ERR(host_context)) { error = PTR_ERR(host_context); pr_warn("Failed to initialize VMCIContext (error%d)\n", error); return error; } error = misc_register(&vmci_host_miscdev); if (error) { pr_warn("Module registration error (name=%s, major=%d, minor=%d, err=%d)\n", vmci_host_miscdev.name, MISC_MAJOR, vmci_host_miscdev.minor, error); pr_warn("Unable to initialize host personality\n"); vmci_ctx_destroy(host_context); return error; } pr_info("VMCI host device registered (name=%s, major=%d, minor=%d)\n", vmci_host_miscdev.name, MISC_MAJOR, vmci_host_miscdev.minor); vmci_host_device_initialized = true; return 0; } void __exit vmci_host_exit(void) { vmci_host_device_initialized = false; misc_deregister(&vmci_host_miscdev); vmci_ctx_destroy(host_context); vmci_qp_broker_exit(); pr_debug("VMCI host driver module unloaded\n"); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. NET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Ethernet handlers. * * Version: @(#)eth.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <linux/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. */ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. */ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { u8 addr[ETH_ALEN]; eth_random_addr(addr); __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_set - Assign Ethernet address to a net_device * @dev: pointer to net_device structure * @addr: address to assign * * Assign given address to the net_device, addr_assign_type is not changed. */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, ETH_ALEN); } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). */ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; eth_hw_addr_set(dst, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. */ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask) { int i; for (i = 0; i < ETH_ALEN; i++) { if ((addr1[i] ^ addr2[i]) & mask[i]) return false; } return true; } static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask); } static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask); } static inline bool ether_addr_is_ip_mcast(const u8 *addr) { return ether_addr_is_ipv4_mcast(addr) || ether_addr_is_ipv6_mcast(addr); } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address. * * @offset: Offset to add. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_add(u8 *addr, long offset) { u64 u = ether_addr_to_u64(addr); u += offset; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address if one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ... b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_hw_addr_gen - Generate and assign Ethernet address to a port * @dev: pointer to port's net_device structure * @base_addr: base Ethernet address * @id: offset to add to the base address * * Generate a MAC address using a base address and an offset and assign it * to a net_device. Commonly used by switch drivers which need to compute * addresses for all their ports. addr_assign_type is not changed. */ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id) { u64 u = ether_addr_to_u64(base_addr); u8 addr[ETH_ALEN]; u += id; u64_to_ether_addr(u, addr); eth_hw_addr_set(dev, addr); } /** * eth_skb_pkt_type - Assign packet type if destination address does not match * @skb: Assigned a packet type if address does not match @dev address * @dev: Network device used to compare packet address against * * If the destination MAC address of the packet does not match the network * device address, assign an appropriate packet type. */ static inline void eth_skb_pkt_type(struct sk_buff *skb, const struct net_device *dev) { const struct ethhdr *eth = eth_hdr(skb); if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { skb->pkt_type = PACKET_OTHERHOST; } } } static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); return eth; } /** * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */ |
| 301 298 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 | // SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork() * * Copyright (C) 2008-2009 Red Hat, Inc. * Authors: * Izik Eidus * Andrea Arcangeli * Chris Wright * Hugh Dickins */ #include <linux/errno.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/cputime.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> #include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" #include "mm_slot.h" #define CREATE_TRACE_POINTS #include <trace/events/ksm.h> #ifdef CONFIG_NUMA #define NUMA(x) (x) #define DO_NUMA(x) do { (x); } while (0) #else #define NUMA(x) (0) #define DO_NUMA(x) do { } while (0) #endif typedef u8 rmap_age_t; /** * DOC: Overview * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * * In order to reduce excessive scanning, KSM sorts the memory pages by their * contents into a data structure that holds pointers to the pages' locations. * * Since the contents of the pages may change at any moment, KSM cannot just * insert the pages into a normal sorted tree and expect it to find anything. * Therefore KSM uses two data structures - the stable and the unstable tree. * * The stable tree holds pointers to all the merged pages (ksm pages), sorted * by their contents. Because each such page is write-protected, searching on * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * * The stable tree node includes information required for reverse * mapping from a KSM page to virtual addresses that map this page. * * In order to avoid large latencies of the rmap walks on KSM pages, * KSM maintains two types of nodes in the stable tree: * * * the regular nodes that keep the reverse mapping structures in a * linked list * * the "chains" that link nodes ("dups") that represent the same * write protected memory content, but each "dup" corresponds to a * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". The unstable tree sorts these pages * by their contents, but since they are not write-protected, KSM cannot rely * upon the unstable tree to work correctly - the unstable tree is liable to * be corrupted as its contents are modified, and so it is called unstable. * * KSM solves this problem by several techniques: * * 1) The unstable tree is flushed every time KSM completes scanning all * memory areas, and then the tree is rebuilt again from the beginning. * 2) KSM will only insert into the unstable tree, pages whose hash value * has not changed since the previous scan of all memory areas. * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the * colors of the nodes and not on their contents, assuring that even when * the tree gets "corrupted" it won't get out of balance, so scanning time * remains the same (also, searching and inserting nodes in an rbtree uses * the same algorithm, so we have no overhead when we flush and rebuild). * 4) KSM never flushes the stable tree, which means that even if it were to * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) * * If the merge_across_nodes tunable is unset, then KSM maintains multiple * stable trees and multiple unstable trees: one of each for each NUMA node. */ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items */ struct ksm_mm_slot { struct mm_slot slot; struct ksm_rmap_item *rmap_list; }; /** * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { struct ksm_mm_slot *mm_slot; unsigned long address; struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; struct { struct hlist_node hlist_dup; struct list_head list; }; }; }; struct hlist_head hlist; union { unsigned long kpfn; unsigned long chain_prune_time; }; /* * STABLE_NODE_CHAIN can be any negative number in * rmap_hlist_len negative range, but better not -1 to be able * to reliably detect underflows. */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node * @age: number of scan iterations since creation * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ rmap_age_t age; rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ struct ksm_stable_node *head; struct hlist_node hlist; }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* Default number of pages to scan per batch */ #define DEFAULT_PAGES_TO_SCAN 100 /* The number of pages scanned */ static unsigned long ksm_pages_scanned; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; /* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; /* The number of nodes in the unstable tree */ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; /* The number of stable_node chains */ static unsigned long ksm_stable_node_chains; /* The number of stable_node dups linked to the stable_node chains */ static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; /* Skip pages that couldn't be de-duplicated previously */ /* Default to true at least temporarily, for testing */ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; /* Don't scan more than max pages per batch. */ static unsigned long ksm_advisor_max_pages_to_scan = 30000; /* Min CPU for scanning pages per scan */ #define KSM_ADVISOR_MIN_CPU 10 /* Max CPU for scanning pages per scan */ static unsigned int ksm_advisor_max_cpu = 70; /* Target scan time in seconds to analyze all KSM candidate pages. */ static unsigned long ksm_advisor_target_scan_time = 200; /* Exponentially weighted moving average. */ #define EWMA_WEIGHT 30 /** * struct advisor_ctx - metadata for KSM advisor * @start_scan: start time of the current scan * @scan_time: scan time of previous scan * @change: change in percent to pages_to_scan parameter * @cpu_time: cpu time consumed by the ksmd thread in the previous scan */ struct advisor_ctx { ktime_t start_scan; unsigned long scan_time; unsigned long change; unsigned long long cpu_time; }; static struct advisor_ctx advisor_ctx; /* Define different advisor's */ enum ksm_advisor_type { KSM_ADVISOR_NONE, KSM_ADVISOR_SCAN_TIME, }; static enum ksm_advisor_type ksm_advisor; #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ /* At least scan this many pages per batch. */ static unsigned long ksm_advisor_min_pages_to_scan = 500; static void set_advisor_defaults(void) { if (ksm_advisor == KSM_ADVISOR_NONE) { ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) { advisor_ctx = (const struct advisor_ctx){ 0 }; ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan; } } #endif /* CONFIG_SYSFS */ static inline void advisor_start_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) advisor_ctx.start_scan = ktime_get(); } /* * Use previous scan time if available, otherwise use current scan time as an * approximation for the previous scan time. */ static inline unsigned long prev_scan_time(struct advisor_ctx *ctx, unsigned long scan_time) { return ctx->scan_time ? ctx->scan_time : scan_time; } /* Calculate exponential weighted moving average */ static unsigned long ewma(unsigned long prev, unsigned long curr) { return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100; } /* * The scan time advisor is based on the current scan rate and the target * scan rate. * * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time) * * To avoid perturbations it calculates a change factor of previous changes. * A new change factor is calculated for each iteration and it uses an * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * * new_pages_to_scan *= change facor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new * estimated cpu percent cost for the next scan. That value is capped by the * cpu min and max setting. * * In addition the new pages_to_scan value is capped by the max and min * limits. */ static void scan_time_advisor(void) { unsigned int cpu_percent; unsigned long cpu_time; unsigned long cpu_time_diff; unsigned long cpu_time_diff_ms; unsigned long pages; unsigned long per_page_cost; unsigned long factor; unsigned long change; unsigned long last_scan_time; unsigned long scan_time; /* Convert scan time to seconds */ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan), MSEC_PER_SEC); scan_time = scan_time ? scan_time : 1; /* Calculate CPU consumption of ksmd background thread */ cpu_time = task_sched_runtime(current); cpu_time_diff = cpu_time - advisor_ctx.cpu_time; cpu_time_diff_ms = cpu_time_diff / 1000 / 1000; cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000); cpu_percent = cpu_percent ? cpu_percent : 1; last_scan_time = prev_scan_time(&advisor_ctx, scan_time); /* Calculate scan time as percentage of target scan time */ factor = ksm_advisor_target_scan_time * 100 / scan_time; factor = factor ? factor : 1; /* * Calculate scan time as percentage of last scan time and use * exponentially weighted average to smooth it */ change = scan_time * 100 / last_scan_time; change = change ? change : 1; change = ewma(advisor_ctx.change, change); /* Calculate new scan rate based on target scan rate. */ pages = ksm_thread_pages_to_scan * 100 / factor; /* Update pages_to_scan by weighted change percentage. */ pages = pages * change / 100; /* Cap new pages_to_scan value */ per_page_cost = ksm_thread_pages_to_scan / cpu_percent; per_page_cost = per_page_cost ? per_page_cost : 1; pages = min(pages, per_page_cost * ksm_advisor_max_cpu); pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU); pages = min(pages, ksm_advisor_max_pages_to_scan); /* Update advisor context */ advisor_ctx.change = change; advisor_ctx.scan_time = scan_time; advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) scan_time_advisor(); } #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; static int ksm_nr_node_ids = 1; #else #define ksm_merge_across_nodes 1U #define ksm_nr_node_ids 1 #endif #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); static int __init ksm_slab_init(void) { rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; stable_node_cache = KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; return 0; out_free2: kmem_cache_destroy(stable_node_cache); out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; } static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; VM_BUG_ON(!is_stable_node_chain(chain)); hlist_add_head(&dup->hlist_dup, &chain->hlist); ksm_stable_node_dups++; } static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) __stable_node_dup_del(dup); else rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); #ifdef CONFIG_DEBUG_VM dup->head = NULL; #endif } static inline struct ksm_rmap_item *alloc_rmap_item(void) { struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; } static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under * pressure, which may lead to hung task warnings. Adding __GFP_HIGH * grants access to memory reserves, helping to avoid this problem. */ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. */ static inline bool ksm_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; if (lock_vma) vma_start_write(vma); do { bool ksm_page = false; struct folio_walk fw; struct folio *folio; cond_resched(); folio = folio_walk_start(&fw, vma, addr, FW_MIGRATION | FW_ZEROPAGE); if (folio) { /* Small folio implies FW_LEVEL_PTE. */ if (!folio_test_large(folio) && (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) ksm_page = true; folio_walk_end(&fw, vma); } if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop until we no longer find a KSM page because * handle_mm_fault() may back out if there's any difficulty e.g. if * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the PageKsm for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but ksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static bool vma_ksm_compatible(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_HUGETLB | VM_MIXEDMAP| VM_DROPPABLE)) return false; /* just ignore the advice */ if (vma_is_dax(vma)) return false; #ifdef VM_SAO if (vma->vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI if (vma->vm_flags & VM_SPARC_ADI) return false; #endif return true; } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; vma = vma_lookup(mm, addr); if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; /* * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr, false); mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; struct page *page = NULL; struct folio_walk fw; struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; folio = folio_walk_start(&fw, vma, addr, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); page = fw.page; } folio_walk_end(&fw, vma); } out: if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } mmap_read_unlock(mm); return page; } /* * This helper is used for getting right index into array of tree roots. * When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, * every node has its own stable and unstable tree. */ static inline int get_kpfn_nid(unsigned long kpfn) { return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; /* * Put the stable node chain in the first dimension of * the stable tree and at the same time remove the old * stable node. */ rb_replace_node(&dup->node, &chain->node, root); /* * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); } return chain; } static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); free_stable_node(chain); ksm_stable_node_chains--; } static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) { ksm_pages_sharing--; trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm); } else { ksm_pages_shared--; } rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } /* * We need the second aligned pointer of the migrate_nodes * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. */ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); trace_ksm_remove_ksm_page(stable_node->kpfn); if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else stable_node_dup_del(stable_node); free_stable_node(stable_node); } enum ksm_get_folio_flags { KSM_GET_FOLIO_NOLOCK, KSM_GET_FOLIO_LOCK, KSM_GET_FOLIO_TRYLOCK }; /* * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for * ksmd to come around again before it can free the page, which may take * seconds or even minutes: much too unresponsive. So instead we use a * "keyhole reference": access to the ksm page from the stable node peeps * out through its keyhole to see if that page still holds the right key, * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, enum ksm_get_folio_flags flags) { struct folio *folio; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!folio_try_get(folio)) { /* * Another check for folio->mapping != expected_mapping * would work here too. We have chosen to test the * swapcache flag to optimize the common case, when the * folio is or is about to be freed: the swapcache flag * is cleared (under spin_lock_irq) in the ref_freeze * section of __remove_mapping(); but anon folio->mapping * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } if (READ_ONCE(folio->mapping) != expected_mapping) { folio_put(folio); goto stale; } if (flags == KSM_GET_FOLIO_TRYLOCK) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EBUSY); } } else if (flags == KSM_GET_FOLIO_LOCK) folio_lock(folio); if (flags != KSM_GET_FOLIO_NOLOCK) { if (READ_ONCE(folio->mapping) != expected_mapping) { folio_unlock(folio); folio_put(folio); goto stale; } } return folio; stale: /* * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; } /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; struct folio *folio; stable_node = rmap_item->head; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) goto out; hlist_del(&rmap_item->hlist); folio_unlock(folio); folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. * But be careful when an mm is exiting: do the rb_erase * if this rmap_item was inserted by this scan, rather * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } } /* * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. */ static int unmerge_ksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool lock_vma) { unsigned long addr; int err = 0; for (addr = start; addr < end && !err; addr += PAGE_SIZE) { if (ksm_test_exit(vma->vm_mm)) break; if (signal_pending(current)) err = -ERESTARTSYS; else err = break_ksm(vma, addr, lock_vma); } return err; } static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ static int remove_stable_node(struct ksm_stable_node *stable_node) { struct folio *folio; int err; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) { /* * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } /* * Page could be still mapped if this races with __mmput() running in * between ksm_exit() and exit_mmap(). Just refuse to let * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; if (!folio_mapped(folio)) { /* * The stable node did not yet appear stale to ksm_get_folio(), * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. * This folio might be in an LRU cache waiting to be freed, * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. */ folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } folio_unlock(folio); folio_put(folio); return err; } static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); if (remove_stable_node(stable_node)) return true; else return false; } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); if (remove_stable_node(dup)) return true; } BUG_ON(!hlist_empty(&stable_node->hlist)); free_stable_node_chain(stable_node, root); return false; } static int remove_all_stable_nodes(void) { struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); } return err; } static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); slot = list_entry(ksm_mm_head.slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); mm = mm_slot->slot.mm; mmap_read_lock(mm); /* * Exit right away if mm is exiting to avoid lockdep issue in * the maple tree */ if (ksm_test_exit(mm)) goto mm_exiting; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); } /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; error: mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); return err; } #endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_local_page(page); checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_local(addr); return checksum; } static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; if (WARN_ON_ONCE(folio_test_large(folio))) return err; pvmw.address = page_address_in_vma(&folio->page, vma); if (pvmw.address == -EFAULT) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; anon_exclusive = PageAnonExclusive(&folio->page); entry = ptep_get(pvmw.pte); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. * * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /** * replace_page - replace page in vma by new ksm page * @vma: vma that holds the pte pointing to page * @page: the page we are replacing by kpage * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; struct folio *folio; pmd_t *pmd; pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; struct mmu_notifier_range range; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; /* * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } VM_BUG_ON_PAGE(PageAnonExclusive(page), page); VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage), kfolio); /* * No need to check ksm_use_zero_pages here: we can only have a * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { folio_get(kfolio); folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { /* * Use pte_mkdirty to mark the zero page mapped by KSM, and then * we can easily track all KSM-placed zero pages by checking if * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we * will get wrong values in /proc, and a BUG message in dmesg * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. * * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); folio = page_folio(page); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage * @kpage: the PageKsm page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; if (!PageAnon(page)) goto out; /* * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We use trylock_page() instead of * lock_page() because we don't want to wait here - we * prefer to continue scanning and merging different pages, * then come back to this page when it is unlocked. */ if (!trylock_page(page)) goto out; if (PageTransCompound(page)) { if (split_huge_page(page)) goto out_unlock; } /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ if (write_protect_page(vma, page_folio(page), &orig_pte) == 0) { if (!kpage) { /* * While we hold page lock, upgrade page from * PageAnon+anon_vma to PageKsm+NULL stable_node: * stable_tree_insert() will update stable_node. */ folio_set_stable_node(page_folio(page), NULL); mark_page_accessed(page); /* * Page reclaim just frees a clean page with no dirty * ptes: make sure that the ksm page would be swapped. */ if (!PageDirty(page)) SetPageDirty(page); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: unlock_page(page); out: return err; } /* * This function returns 0 if the pages were merged or if they are * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise. */ static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item, struct page *page) { struct mm_struct *mm = rmap_item->mm; int err = -EFAULT; /* * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); trace_ksm_merge_one_page( page_to_pfn(ZERO_PAGE(rmap_item->address)), rmap_item, mm, err); } else { /* * If the vma is out of date, we do not need to * continue. */ err = 0; } mmap_read_unlock(mm); } return err; } /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; } /* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page; } static __always_inline bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* * Check that at least one mapping still exists, otherwise * there's no much point to merge and share with this * stable_node, as the underlying tree_page of the other * sharer is going to be freed soon. */ return stable_node->rmap_hlist_len && stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; } static __always_inline bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct folio *folio, *tree_folio = NULL; int found_rmap_hlist_len; if (!prune_stale_stable_nodes || time_before(jiffies, stable_node->chain_prune_time + msecs_to_jiffies( ksm_stable_node_chains_prune_millisecs))) prune_stale_stable_nodes = false; else stable_node->chain_prune_time = jiffies; hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { cond_resched(); /* * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); if (!folio) continue; /* Pick the best candidate if possible. */ if (!found || (is_page_sharing_candidate(dup) && (!is_page_sharing_candidate(found) || dup->rmap_hlist_len > found_rmap_hlist_len))) { if (found) folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; tree_folio = folio; /* skip put_page for found candidate */ if (!prune_stale_stable_nodes && is_page_sharing_candidate(found)) break; continue; } folio_put(folio); } if (found) { if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM * context with no lock held it's not even * fatal. */ BUG_ON(stable_node->hlist.first->next); /* * There's just one entry and it is below the * deduplication limit so drop the chain. */ rb_replace_node(&stable_node->node, &found->node, root); free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; /* * NOTE: the caller depends on the stable_node * to be equal to stable_node_dup if the chain * was collapsed. */ *_stable_node = found; /* * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. */ stable_node = NULL; } else if (stable_node->hlist.first != &found->hlist_dup && __is_page_sharing_candidate(found, 1)) { /* * If the found stable_node dup can accept one * more future merge (in addition to the one * that is underway) and is not at the head of * the chain, put it there so next search will * be quicker in the !prune_stale_stable_nodes * case. * * NOTE: it would be inaccurate to use nr > 1 * instead of checking the hlist.first pointer * directly, because in the * prune_stale_stable_nodes case "nr" isn't * the position of the found dup in the chain, * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, &stable_node->hlist); } } else { /* Its hlist must be empty if no one found. */ free_stable_node_chain(stable_node, root); } *_stable_node_dup = found; return tree_folio; } /* * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. * * It can also free and overwrite *_stable_node with the found * stable_node_dup if the chain is collapsed (in which case * *_stable_node will be equal to *_stable_node_dup like if the chain * never existed). It's up to the caller to verify tree_page is not * NULL before dereferencing *_stable_node or *_stable_node_dup. * * *_stable_node_dup is really a second output parameter of this * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { *_stable_node_dup = stable_node; return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, false); } /* * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, * NULL otherwise. */ static struct page *stable_tree_search(struct page *page) { int nid; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; struct ksm_stable_node *page_node; struct folio *folio; folio = page_folio(page); page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); return &folio->page; } nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain_prune(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* * If the mapcount of our migrated KSM folio is * at most 1, we can merge it with another * KSM folio where we know that we have space * for one more mapping without exceeding the * ksm_max_page_sharing limit: see * chain_prune(). This way, we can avoid adding * this stable node to the chain. */ if (folio_mapcount(folio) > 1) goto chain_append; } if (!is_page_sharing_candidate(stable_node_dup)) { /* * If the stable_node is a chain and * we got a payload match in memcmp * but we cannot merge the scanned * page in any of the existing * stable_node dups because they're * all full, we need to wait the * scanned page to find itself a match * in the unstable tree to create a * brand new KSM page to add later to * the dups of this stable_node. */ return NULL; } /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page * migration is sure to notice its raised count. * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ tree_folio = ksm_get_folio(stable_node_dup, KSM_GET_FOLIO_TRYLOCK); if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { folio_put(tree_folio); goto replace; } return &tree_folio->page; } } if (!page_node) return NULL; list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); return &folio->page; } else return NULL; replace: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_replace_node(&stable_node_dup->node, &page_node->node, root); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { rb_erase(&stable_node_dup->node, root); folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); __stable_node_dup_del(stable_node_dup); if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); return &folio->page; chain_append: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); if (!stable_node) return NULL; } /* * Add this stable_node dup that was * migrated to the stable_node chain * of the current nid for this page * content. */ VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); goto out; } /* * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, * NULL otherwise. */ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; bool need_chain = false; kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: parent = NULL; new = &root->rb_node; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(&kfolio->page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { need_chain = true; break; } } stable_node_dup = alloc_stable_node(); if (!stable_node_dup) return NULL; INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { rb_link_node(&stable_node_dup->node, parent, new); rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { free_stable_node(stable_node_dup); return NULL; } } stable_node_chain_add_dup(stable_node_dup, stable_node); } folio_set_stable_node(kfolio, stable_node_dup); return stable_node_dup; } /* * unstable_tree_search_insert - search for identical page, * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the * tree, we insert rmap_item as a new object into the unstable tree. * * This function returns pointer to rmap_item found to be identical * to the currently scanned page, NULL otherwise. * * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ static struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { struct rb_node **new; struct rb_root *root; struct rb_node *parent = NULL; int nid; nid = get_kpfn_nid(page_to_pfn(page)); root = root_unstable_tree + nid; new = &root->rb_node; while (*new) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; /* * Don't substitute a ksm page for a forked page. */ if (page == tree_page) { put_page(tree_page); return NULL; } ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { /* * If tree_page has been migrated to another NUMA node, * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ put_page(tree_page); return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; } } rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; } /* * stable_tree_append - add another rmap_item to the linked list of * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ static void stable_tree_append(struct ksm_rmap_item *rmap_item, struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* * rmap won't find this mapping if we don't insert the * rmap_item in the right stable_node * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ BUG_ON(stable_node->rmap_hlist_len < 0); stable_node->rmap_hlist_len++; if (!max_page_sharing_bypass) /* possibly non fatal but unexpected overflow, only warn */ WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing); rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); if (rmap_item->hlist.next) ksm_pages_sharing++; else ksm_pages_shared++; rmap_item->mm->ksm_merging_pages++; } /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can * be inserted into the unstable tree, or merged with a page already there and * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; struct page *kpage; unsigned int checksum; int err; bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != NUMA(stable_node->nid)) { stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; /* * If it's a KSM fork, allow it to go over the sharing limit * without warnings. */ if (!is_page_sharing_candidate(stable_node)) max_page_sharing_bypass = true; } else { remove_rmap_item_from_tree(rmap_item); /* * If the hash value of the page has changed from the last time * we calculated it, this page is changing frequently: therefore we * don't want to insert it in the unstable tree, and we don't want * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { rmap_item->oldchecksum = checksum; return; } if (!try_to_merge_with_zero_page(rmap_item, page)) return; } /* We first start with searching the page inside the stable tree */ kpage = stable_tree_search(page); if (kpage == page && rmap_item->head == stable_node) { put_page(kpage); return; } remove_rmap_item_from_tree(rmap_item); if (kpage) { if (PTR_ERR(kpage) == -EBUSY) return; err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ lock_page(kpage); stable_tree_append(rmap_item, page_stable_node(kpage), max_page_sharing_bypass); unlock_page(kpage); } put_page(kpage); return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { bool split; kpage = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound * page, then we actually ended up increasing the reference * count of the same compound page twice, and split_huge_page * failed. * Here we set a flag if that happened, and we use it later to * try split_huge_page again. Since we call put_page right * afterwards, the reference count will be correct and * split_huge_page should succeed. */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kpage) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ lock_page(kpage); stable_node = stable_tree_insert(page_folio(kpage)); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } unlock_page(kpage); /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ if (!stable_node) { break_cow(tree_rmap_item); break_cow(rmap_item); } } else if (split) { /* * We are here if we tried to merge two pages and * failed because they both belonged to the same * compound page. We will split the page now, but no * merging will take place. * We do not want to add the cost of a full lock; if * the page is locked, it is better to skip it and * perhaps try again later. */ if (!trylock_page(page)) return; split_huge_page(page); unlock_page(page); } } } static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, struct ksm_rmap_item **rmap_list, unsigned long addr) { struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; } return rmap_item; } /* * Calculate skip age for the ksm page age. The age determines how often * de-duplicating has already been tried unsuccessfully. If the age is * smaller, the scanning of this page is skipped for less scans. * * @age: rmap_item age of page */ static unsigned int skip_age(rmap_age_t age) { if (age <= 3) return 1; if (age <= 5) return 2; if (age <= 8) return 4; return 8; } /* * Determines if a page should be skipped for the current scan. * * @page: page to check * @rmap_item: associated rmap_item of page */ static bool should_skip_rmap_item(struct page *page, struct ksm_rmap_item *rmap_item) { rmap_age_t age; if (!ksm_smart_scan) return false; /* * Never skip pages that are already KSM; pages cmp_and_merge_page() * will essentially ignore them, but we still have to process them * properly. */ if (PageKsm(page)) return false; age = rmap_item->age; if (age != U8_MAX) rmap_item->age++; /* * Smaller ages are not skipped, they need to get a chance to go * through the different phases of the KSM merging. */ if (age < 3) return false; /* * Are we still allowed to skip? If not, then don't skip it * and determine how much more often we are allowed to skip next. */ if (!rmap_item->remaining_skips) { rmap_item->remaining_skips = skip_age(age); return false; } /* Skip this page */ ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; } static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; mm_slot = ksm_scan.mm_slot; if (mm_slot == &ksm_mm_head) { advisor_start_scan(); trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* * A number of pages can hang around indefinitely in per-cpu * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to ksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low). */ lru_add_drain_all(); /* * Whereas stale stable_nodes on the stable_tree itself * get pruned in the regular course of stable_tree_search(), * those moved out to the migrate_nodes list can accumulate: * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); if (folio) folio_put(folio); cond_resched(); } } for (nid = 0; nid < ksm_nr_node_ids; nid++) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); mmap_read_lock(mm); if (ksm_test_exit(mm)) goto no_vmas; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; if (!vma->anon_vma) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { struct page *tmp_page = NULL; struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); tmp_page = fw.page; } folio_walk_end(&fw, vma); } if (tmp_page) { flush_anon_page(vma, tmp_page, ksm_scan.address); flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; if (should_skip_rmap_item(tmp_page, rmap_item)) { folio_put(folio); goto next_page; } ksm_scan.address += PAGE_SIZE; *page = tmp_page; } else { folio_put(folio); } mmap_read_unlock(mm); return rmap_item; } next_page: ksm_scan.address += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); } else { mmap_read_unlock(mm); /* * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * ksm_scan.mm_slot doesn't point to it anymore. */ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ mm_slot = ksm_scan.mm_slot; if (mm_slot != &ksm_mm_head) goto next_mm; advisor_stop_scan(); trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); ksm_scan.seqnr++; return NULL; } /** * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); ksm_pages_scanned++; } } static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) { unsigned int sleep_ms; set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_freezable_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } static void __ksm_add_vma(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; if (vm_flags & VM_MERGEABLE) return; if (vma_ksm_compatible(vma)) vm_flags_set(vma, VM_MERGEABLE); } static int __ksm_del_vma(struct vm_area_struct *vma) { int err; if (!(vma->vm_flags & VM_MERGEABLE)) return 0; if (vma->anon_vma) { err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } vm_flags_clear(vma, VM_MERGEABLE); return 0; } /** * ksm_add_vma - Mark vma as mergeable if compatible * * @vma: Pointer to vma */ void ksm_add_vma(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) __ksm_add_vma(vma); } static void ksm_add_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) __ksm_add_vma(vma); } static int ksm_del_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; int err; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { err = __ksm_del_vma(vma); if (err) return err; } return 0; } /** * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all * compatible VMA's * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_enable_merge_any(struct mm_struct *mm) { int err; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } set_bit(MMF_VM_MERGE_ANY, &mm->flags); ksm_add_vmas(mm); return 0; } /** * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm, * previously enabled via ksm_enable_merge_any(). * * Disabling merging implies unmerging any merged pages, like setting * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and * merging on all compatible VMA's remains enabled. * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_disable_merge_any(struct mm_struct *mm) { int err; if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; err = ksm_del_vmas(mm); if (err) { ksm_add_vmas(mm); return err; } clear_bit(MMF_VM_MERGE_ANY, &mm->flags); return 0; } int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) return 0; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; } return 0; } EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int needs_wakeup; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; slot = &mm_slot->slot; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); trace_ksm_enter(mm); return 0; } void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int easy_to_free = 0; /* * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&slot->hash); list_del(&slot->mm_node); easy_to_free = 1; } else { list_move(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); mmap_write_unlock(mm); } trace_ksm_exit(mm); } struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return folio; /* no need to copy it */ } else if (!anon_vma) { return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); new_folio = NULL; } if (new_folio) { if (copy_mc_user_highpage(folio_page(new_folio, 0), page, addr, vma)) { folio_put(new_folio); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); __folio_mark_uptodate(new_folio); __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); stable_node = folio_stable_node(folio); if (!stable_node) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); if (!anon_vma_trylock_read(anon_vma)) { if (rwc->try_lock) { rwc->contended = true; return; } anon_vma_lock_read(anon_vma); } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; cond_resched(); vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, * we examine covering vmas in other mms: in case they * were forked from the original since ksmd passed. */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; } #ifdef CONFIG_MEMORY_FAILURE /* * Collect processes when the error hit an ksm page. */ void collect_procs_ksm(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; struct vm_area_struct *vma; struct task_struct *tsk; stable_node = folio_stable_node(folio); if (!stable_node) return; hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *av = rmap_item->anon_vma; anon_vma_lock_read(av); rcu_read_lock(); for_each_process(tsk) { struct anon_vma_chain *vmac; unsigned long addr; struct task_struct *t = task_early_kill(tsk, force_early); if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0, ULONG_MAX) { vma = vmac->vma; if (vma->vm_mm == t->mm) { addr = rmap_item->address & PAGE_MASK; add_to_kill_ksm(t, page, vma, to_kill, addr); } } } rcu_read_unlock(); anon_vma_unlock_read(av); } } #endif #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); stable_node = folio_stable_node(folio); if (stable_node) { VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); stable_node->kpfn = folio_pfn(newfolio); /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that the swapcache flag has been cleared). */ smp_wmb(); folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); return true; } return false; } static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); return stable_node_dup_remove_range(stable_node, start_pfn, end_pfn); } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); stable_node_dup_remove_range(dup, start_pfn, end_pfn); } if (hlist_empty(&stable_node->hlist)) { free_stable_node_chain(stable_node, root); return true; /* notify caller that tree was rebalanced */ } else return false; } static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + nid)) node = rb_first(root_stable_tree + nid); else node = rb_next(node); cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); cond_resched(); } } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; switch (action) { case MEM_GOING_OFFLINE: /* * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. * But unmerge_ksm_pages(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); ksm_run |= KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); smp_mb(); /* wake_up_bit advises this */ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } #else static void wait_while_offlining(void) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; wake_up_interruptible(&ksm_iter_wait); return count; } KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int nr_pages; int err; if (ksm_advisor != KSM_ADVISOR_NONE) return -EINVAL; err = kstrtouint(buf, 10, &nr_pages); if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; return count; } KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int flags; int err; err = kstrtouint(buf, 10, &flags); if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, * breaking COW to free the pages_shared (but leaves mm_slots * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; } } } mutex_unlock(&ksm_thread_mutex); if (flags & KSM_RUN_MERGE) wake_up_interruptible(&ksm_thread_wait); return count; } KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long knob; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_merge_across_nodes != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else if (root_stable_tree == one_stable_tree) { struct rb_root *buf; /* * This is the first time that we switch away from the * default of merging across nodes: must now allocate * a buffer to hold as many roots as may be needed. * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; else { root_stable_tree = buf; root_unstable_tree = buf + nr_node_ids; /* Stable tree is empty but not the unstable */ root_unstable_tree[0] = one_unstable_tree[0]; } } if (!err) { ksm_merge_across_nodes = knob; ksm_nr_node_ids = knob ? 1 : nr_node_ids; } } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_use_zero_pages = value; return count; } KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; int knob; err = kstrtoint(buf, 10, &knob); if (err) return err; /* * When a KSM page is created it is shared by 2 mappings. This * being a signed comparison, it implicitly verifies it's not * negative. */ if (knob < 2) return -EINVAL; if (READ_ONCE(ksm_max_page_sharing) == knob) return count; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_max_page_sharing != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else ksm_max_page_sharing = knob; } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(max_page_sharing); static ssize_t pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); } KSM_ATTR_RO(pages_scanned); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); static ssize_t pages_volatile_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long ksm_pages_volatile; ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared; /* * It was not worth any locking to calculate that statistic, * but it might therefore sometimes be negative: conceal that. */ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); } KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long general_profit; general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); } KSM_ATTR_RO(general_profit); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); static ssize_t stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; return count; } KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); static ssize_t smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_smart_scan); } static ssize_t smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_smart_scan = value; return count; } KSM_ATTR(smart_scan); static ssize_t advisor_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (ksm_advisor == KSM_ADVISOR_NONE) output = "[none] scan-time"; else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { enum ksm_advisor_type curr_advisor = ksm_advisor; if (sysfs_streq("scan-time", buf)) ksm_advisor = KSM_ADVISOR_SCAN_TIME; else if (sysfs_streq("none", buf)) ksm_advisor = KSM_ADVISOR_NONE; else return -EINVAL; /* Set advisor default values */ if (curr_advisor != ksm_advisor) set_advisor_defaults(); return count; } KSM_ATTR(advisor_mode); static ssize_t advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu); } static ssize_t advisor_max_cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_cpu = value; return count; } KSM_ATTR(advisor_max_cpu); static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan); } static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_min_pages_to_scan = value; return count; } KSM_ATTR(advisor_min_pages_to_scan); static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan); } static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_pages_to_scan = value; return count; } KSM_ATTR(advisor_max_pages_to_scan); static ssize_t advisor_target_scan_time_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time); } static ssize_t advisor_target_scan_time_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; if (value < 1) return -EINVAL; ksm_advisor_target_scan_time = value; return count; } KSM_ATTR(advisor_target_scan_time); static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif &max_page_sharing_attr.attr, &stable_node_chains_attr.attr, &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, &smart_scan_attr.attr, &advisor_mode_attr.attr, &advisor_max_cpu_attr.attr, &advisor_min_pages_to_scan_attr.attr, &advisor_max_pages_to_scan_attr.attr, &advisor_target_scan_time_attr.attr, NULL, }; static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; #endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; /* The correct value depends on page size and endianness */ zero_checksum = calc_checksum(ZERO_PAGE(0)); /* Default to false for backwards compatibility */ ksm_use_zero_pages = false; err = ksm_slab_init(); if (err) goto out; ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; out_free: ksm_slab_free(); out: return err; } subsys_initcall(ksm_init); |
| 11 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DAX_H #define _LINUX_DAX_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/radix-tree.h> typedef unsigned long dax_entry_t; struct dax_device; struct gendisk; struct iomap_ops; struct iomap_iter; struct iomap; enum dax_access_mode { DAX_ACCESS, DAX_RECOVERY_WRITE, }; struct dax_operations { /* * direct_access: translate a device-relative * logical-page-offset into an absolute physical pfn. Return the * number of pages available for DAX at that pfn. */ long (*direct_access)(struct dax_device *, pgoff_t, long, enum dax_access_mode, void **, pfn_t *); /* * Validate whether this device is usable as an fsdax backing * device. */ bool (*dax_supported)(struct dax_device *, struct block_device *, int, sector_t, sector_t); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); /* * recovery_write: recover a poisoned range by DAX device driver * capable of clearing poison. */ size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *iter); }; struct dax_holder_operations { /* * notify_failure - notify memory failure into inner holder device * @dax_dev: the dax device which contains the holder * @offset: offset on this dax device where memory failure occurs * @len: length of this memory failure event * @flags: action flags for memory failure handler */ int (*notify_failure)(struct dax_device *dax_dev, u64 offset, u64 len, int mf_flags); }; #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); void set_dax_nocache(struct dax_device *dax_dev); void set_dax_nomc(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); /* * Check if given mapping is supported by the file / underlying device. */ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { if (!(vma->vm_flags & VM_SYNC)) return true; if (!IS_DAX(file_inode(vma->vm_file))) return false; return dax_synchronous(dax_dev); } #else static inline void *dax_holder(struct dax_device *dax_dev) { return NULL; } static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { return ERR_PTR(-EOPNOTSUPP); } static inline void put_dax(struct dax_device *dax_dev) { } static inline void kill_dax(struct dax_device *dax_dev) { } static inline void dax_write_cache(struct dax_device *dax_dev, bool wc) { } static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) { return false; } static inline bool dax_synchronous(struct dax_device *dax_dev) { return true; } static inline void set_dax_nocache(struct dax_device *dax_dev) { } static inline void set_dax_nomc(struct dax_device *dax_dev) { } static inline void set_dax_synchronous(struct dax_device *dax_dev) { } static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { return !(vma->vm_flags & VM_SYNC); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { return 0; } #endif struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops); void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { return 0; } static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops) { return NULL; } static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ #if IS_ENABLED(CONFIG_FS_DAX) int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_folio(struct folio *folio); void dax_unlock_folio(struct folio *folio, dax_entry_t cookie); dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page **page); void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { return NULL; } static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages) { return NULL; } static inline int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc) { return -EOPNOTSUPP; } static inline dax_entry_t dax_lock_folio(struct folio *folio) { if (IS_DAX(folio->mapping->host)) return ~0UL; return 0; } static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { } static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page **page) { return 0; } static inline void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie) { } #endif int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); #else static inline int dax_read_lock(void) { return 0; } static inline void dax_read_unlock(int id) { } #endif /* CONFIG_DAX */ bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, const struct iomap_ops *ops); int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } /* * Due to dax's memory and block duo personalities, hwpoison reporting * takes into consideration which personality is presently visible. * When dax acts like a block device, such as in block IO, an encounter of * dax hwpoison is reported as -EIO. * When dax acts like memory, such as in page fault, a detection of hwpoison * is reported as -EHWPOISON which leads to VM_FAULT_HWPOISON. */ static inline int dax_mem2blk_err(int err) { return (err == -EHWPOISON) ? -EIO : err; } #ifdef CONFIG_DEV_DAX_HMEM_DEVICES void hmem_register_resource(int target_nid, struct resource *r); #else static inline void hmem_register_resource(int target_nid, struct resource *r) { } #endif typedef int (*walk_hmem_fn)(struct device *dev, int target_nid, const struct resource *res); int walk_hmem_resources(struct device *dev, walk_hmem_fn fn); #endif |
| 6 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_BLOCK_GROUP_H #define BTRFS_BLOCK_GROUP_H #include <linux/atomic.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/refcount.h> #include <linux/wait.h> #include <linux/sizes.h> #include <linux/rwsem.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs_tree.h> #include "free-space-cache.h" struct btrfs_chunk_map; struct btrfs_fs_info; struct btrfs_inode; struct btrfs_trans_handle; enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, BTRFS_DC_ERROR, BTRFS_DC_CLEAR, BTRFS_DC_SETUP, }; enum btrfs_block_group_size_class { /* Unset */ BTRFS_BG_SZ_NONE, /* 0 < size <= 128K */ BTRFS_BG_SZ_SMALL, /* 128K < size <= 8M */ BTRFS_BG_SZ_MEDIUM, /* 8M < size < BG_LENGTH */ BTRFS_BG_SZ_LARGE, }; /* * This describes the state of the block_group for async discard. This is due * to the two pass nature of it where extent discarding is prioritized over * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting * between lists to prevent contention for discard state variables * (eg. discard_cursor). */ enum btrfs_discard_state { BTRFS_DISCARD_EXTENTS, BTRFS_DISCARD_BITMAPS, BTRFS_DISCARD_RESET_CURSOR, }; /* * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to * only allocate a chunk if we really need one. * * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few * chunks already allocated. This is used as part of the clustering code to * help make sure we have a good pool of storage to cluster in, without filling * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one * * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from * find_free_extent() that also activaes the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, CHUNK_ALLOC_FORCE_FOR_EXTENT, }; /* Block group flags set at runtime */ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_IREF, BLOCK_GROUP_FLAG_REMOVED, BLOCK_GROUP_FLAG_TO_COPY, BLOCK_GROUP_FLAG_RELOCATING_REPAIR, BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, /* * Indicate that block group is in the list of new block groups of a * transaction. */ BLOCK_GROUP_FLAG_NEW, }; enum btrfs_caching_type { BTRFS_CACHE_NO, BTRFS_CACHE_STARTED, BTRFS_CACHE_FINISHED, BTRFS_CACHE_ERROR, }; struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group *block_group; /* Track progress of caching during allocation. */ atomic_t progress; refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct btrfs_inode *inode; spinlock_t lock; u64 start; u64 length; u64 pinned; u64 reserved; u64 used; u64 delalloc_bytes; u64 bytes_super; u64 flags; u64 cache_generation; u64 global_root_id; /* * The last committed used bytes of this block group, if the above @used * is still the same as @commit_used, we don't need to update block * group item of this block group. */ u64 commit_used; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. */ u32 bitmap_high_thresh; /* * If the free space extent count drops below this number, convert the * block group back to extents. */ u32 bitmap_low_thresh; /* * It is just used for the delayed data space allocation because * only the data space allocation and the relative metadata update * can be done cross the transaction. */ struct rw_semaphore data_rwsem; /* For raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; unsigned long runtime_flags; unsigned int ro; int disk_cache_state; /* Cache tracking stuff */ int cached; struct btrfs_caching_control *caching_ctl; struct btrfs_space_info *space_info; /* Free space cache stuff */ struct btrfs_free_space_ctl *free_space_ctl; /* Block group cache stuff */ struct rb_node cache_node; /* For block groups in the same raid type */ struct list_head list; refcount_t refs; /* * List of struct btrfs_free_clusters for this block group. * Today it will only have one thing on it, but that may change */ struct list_head cluster_list; /* * Used for several lists: * * 1) struct btrfs_fs_info::unused_bgs * 2) struct btrfs_fs_info::reclaim_bgs * 3) struct btrfs_transaction::deleted_bgs * 4) struct btrfs_trans_handle::new_bgs */ struct list_head bg_list; /* For read-only block groups */ struct list_head ro_list; /* * When non-zero it means the block group's logical address and its * device extents can not be reused for future block group allocations * until the counter goes down to 0. This is to prevent them from being * reused while some task is still using the block group after it was * deleted - we want to make sure they can only be reused for new block * groups after that task is done with the deleted block group. */ atomic_t frozen; /* For discard operations */ struct list_head discard_list; int discard_index; u64 discard_eligible_time; u64 discard_cursor; enum btrfs_discard_state discard_state; /* For dirty block groups */ struct list_head dirty_list; struct list_head io_list; struct btrfs_io_ctl io_ctl; /* * Incremented when doing extent allocations and holding a read lock * on the space_info's groups_sem semaphore. * Decremented when an ordered extent that represents an IO against this * block group's range is created (after it's added to its inode's * root's list of ordered extents) or immediately after the allocation * if it's a metadata extent or fallocate extent (for these cases we * don't create ordered extents). */ atomic_t reservations; /* * Incremented while holding the spinlock *lock* by a task checking if * it can perform a nocow write (incremented if the value for the *ro* * field is 0). Decremented by such tasks once they create an ordered * extent or before that if some error happens before reaching that step. * This is to prevent races between block group relocation and nocow * writes through direct IO. */ atomic_t nocow_writers; /* Lock for free space tree operations. */ struct mutex free_space_lock; /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. */ int swap_extents; /* * Allocation offset for the block group to implement sequential * allocation. This is used only on a zoned filesystem. */ u64 alloc_offset; u64 zone_unusable; u64 zone_capacity; u64 meta_write_pointer; struct btrfs_chunk_map *physical_map; struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; enum btrfs_block_group_size_class size_class; u64 reclaim_mark; }; static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) { lockdep_assert_held(&bg->lock); return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); } static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the * efficiency, so only proper data block groups are considered. */ return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); } #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( struct btrfs_fs_info *info, u64 bytenr); struct btrfs_block_group *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); struct btrfs_block_group *btrfs_next_block_group( struct btrfs_block_group *cache); void btrfs_get_block_group(struct btrfs_block_group *cache); void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); void btrfs_reclaim_bgs_work(struct work_struct *work); void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc); void btrfs_dec_block_group_ro(struct btrfs_block_group *cache); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc, bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, bool is_item_insertion); u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); } static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) { return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); } static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) { return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || cache->cached == BTRFS_CACHE_ERROR; } void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ |
| 72 73 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" /* * /proc/self: */ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_add(self, inode); ret = 0; } else { dput(self); } } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); else fs_info->proc_self = self; return ret; } void __init proc_self_init(void) { proc_alloc_inum(&self_inum); } |
| 74 61 15 1 17 17 7 3 9 3 1 9 9 9 9 376 375 173 173 152 24 3 3 3 3 3 3 2 9 9 2 8 156 4 5 6 1 148 147 1 1 8 177 181 3 1 184 1 158 6 2 14 147 160 4 5 1 142 118 3 3 3 3 3 3 3 1 14 136 14 146 129 24 146 11 118 25 136 1 14 13 1 1 13 2 12 13 1 14 125 135 1 132 3 135 134 1 160 1 159 157 156 2 156 110 110 2 108 108 108 63 24 24 24 63 63 63 124 39 143 174 182 6 176 1 2 116 1 58 2 172 2 42 186 185 3 182 2 174 174 172 172 171 171 1 172 37 1 135 171 1 171 171 171 36 135 171 36 170 228 9 1 12 197 7 42 2 162 2 157 3 17 135 14 188 10 137 37 2 1 47 125 69 2 108 2 126 46 1 171 6 167 171 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 | // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2011 Novell Inc. */ #include <uapi/linux/magic.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/parser.h> #include <linux/module.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include <linux/exportfs.h> #include <linux/file.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "overlayfs.h" #include "params.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); struct ovl_dir_cache; static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type) { struct dentry *upper, *lower; int err; switch (type) { case D_REAL_DATA: case D_REAL_METADATA: break; |