| 49 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <linux/netdevice.h> #include <net/bonding.h> #include <net/bond_alb.h> #if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_NET_NS) #include <linux/debugfs.h> #include <linux/seq_file.h> static struct dentry *bonding_debug_root; /* Show RLB hash table */ static int bond_debug_rlb_hash_show(struct seq_file *m, void *v) { struct bonding *bond = m->private; struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; if (BOND_MODE(bond) != BOND_MODE_ALB) return 0; seq_printf(m, "SourceIP DestinationIP " "Destination MAC DEV\n"); spin_lock_bh(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); seq_printf(m, "%-15pI4 %-15pI4 %-17pM %s\n", &client_info->ip_src, &client_info->ip_dst, &client_info->mac_dst, client_info->slave->dev->name); } spin_unlock_bh(&bond->mode_lock); return 0; } DEFINE_SHOW_ATTRIBUTE(bond_debug_rlb_hash); void bond_debug_register(struct bonding *bond) { bond->debug_dir = debugfs_create_dir(bond->dev->name, bonding_debug_root); debugfs_create_file("rlb_hash_table", 0400, bond->debug_dir, bond, &bond_debug_rlb_hash_fops); } void bond_debug_unregister(struct bonding *bond) { debugfs_remove_recursive(bond->debug_dir); } void bond_debug_reregister(struct bonding *bond) { int err = debugfs_change_name(bond->debug_dir, "%s", bond->dev->name); if (err) { netdev_warn(bond->dev, "failed to reregister, so just unregister old one\n"); bond_debug_unregister(bond); } } void __init bond_create_debugfs(void) { bonding_debug_root = debugfs_create_dir("bonding", NULL); if (IS_ERR(bonding_debug_root)) pr_warn("Warning: Cannot create bonding directory in debugfs\n"); } void bond_destroy_debugfs(void) { debugfs_remove_recursive(bonding_debug_root); bonding_debug_root = NULL; } #else /* !CONFIG_DEBUG_FS */ void bond_debug_register(struct bonding *bond) { } void bond_debug_unregister(struct bonding *bond) { } void bond_debug_reregister(struct bonding *bond) { } void __init bond_create_debugfs(void) { } void bond_destroy_debugfs(void) { } #endif /* CONFIG_DEBUG_FS */ |
| 31 31 31 20 11 31 20 11 20 11 26 5 26 5 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 | // SPDX-License-Identifier: GPL-2.0 /* * Functions related to sysfs handling */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-wbt.h" #include "blk-cgroup.h" #include "blk-throttle.h" struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); void (*load_module)(struct gendisk *disk, const char *page, size_t count); }; static ssize_t queue_var_show(unsigned long var, char *page) { return sysfs_emit(page, "%lu\n", var); } static ssize_t queue_var_store(unsigned long *var, const char *page, size_t count) { int err; unsigned long v; err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; *var = v; return count; } static ssize_t queue_requests_show(struct gendisk *disk, char *page) { return queue_var_show(disk->queue->nr_requests, page); } static ssize_t queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; if (!queue_is_mq(disk->queue)) return -EINVAL; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) return err; return ret; } static ssize_t queue_ra_show(struct gendisk *disk, char *page) { return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); } static ssize_t queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } #define QUEUE_SYSFS_LIMIT_SHOW(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field, page); \ } QUEUE_SYSFS_LIMIT_SHOW(max_segments) QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) QUEUE_SYSFS_LIMIT_SHOW(io_min) QUEUE_SYSFS_LIMIT_SHOW(io_opt) QUEUE_SYSFS_LIMIT_SHOW(discard_granularity) QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity) QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask) QUEUE_SYSFS_LIMIT_SHOW(dma_alignment) QUEUE_SYSFS_LIMIT_SHOW(max_open_zones) QUEUE_SYSFS_LIMIT_SHOW(max_active_zones) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%llu\n", \ (unsigned long long)disk->queue->limits._field << \ SECTOR_SHIFT); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field >> 1, page); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%d\n", _val); \ } /* deprecated fields */ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) static int queue_max_discard_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_discard_bytes; ssize_t ret; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1)) return -EINVAL; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; return 0; } static int queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_sectors_kb; ssize_t ret; ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; lim->max_user_sectors = max_sectors_kb << 1; return 0; } static ssize_t queue_feature_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim, blk_features_t feature) { unsigned long val; ssize_t ret; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; if (val) lim->features |= feature; else lim->features &= ~feature; return 0; } #define QUEUE_SYSFS_FEATURE(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } \ static int queue_##_name##_store(struct gendisk *disk, \ const char *page, size_t count, struct queue_limits *lim) \ { \ return queue_feature_store(disk, page, count, lim, _feature); \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); #define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); static ssize_t queue_poll_show(struct gendisk *disk, char *page) { if (queue_is_mq(disk->queue)) return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); return sysfs_emit(page, "%u\n", !!(disk->queue->limits.features & BLK_FEAT_POLL)); } static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { if (blk_queue_is_zoned(disk->queue)) return sysfs_emit(page, "host-managed\n"); return sysfs_emit(page, "none\n"); } static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { return queue_var_show(disk_nr_zones(disk), page); } static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); } static int queue_iostats_passthrough_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long ios; ssize_t ret; ret = queue_var_store(&ios, page, count); if (ret < 0) return ret; if (ios) lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH; else lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; return 0; } static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | blk_queue_noxmerges(disk->queue), page); } static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue); if (nm == 2) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue); else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue); return ret; } static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); return queue_var_show(set << force, page); } static ssize_t queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) { ssize_t ret = -EINVAL; #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 1) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 0) { blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } #endif return ret; } static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, size_t count) { return count; } static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { if (!(disk->queue->limits.features & BLK_FEAT_POLL)) return -EINVAL; pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); return count; } static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); } static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { unsigned int val; int err; err = kstrtou32(page, 10, &val); if (err || val == 0) return -EINVAL; blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val)); return count; } static ssize_t queue_wc_show(struct gendisk *disk, char *page) { if (blk_queue_write_cache(disk->queue)) return sysfs_emit(page, "write back\n"); return sysfs_emit(page, "write through\n"); } static int queue_wc_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { bool disable; if (!strncmp(page, "write back", 10)) { disable = false; } else if (!strncmp(page, "write through", 13) || !strncmp(page, "none", 4)) { disable = true; } else { return -EINVAL; } if (disable) lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED; else lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; return 0; } #define QUEUE_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ .show = _prefix##_show, \ }; #define QUEUE_RW_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show = _prefix##_show, \ .store = _prefix##_store, \ }; #define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show = _prefix##_show, \ .store_limit = _prefix##_store, \ } #define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show = _prefix##_show, \ .load_module = _prefix##_load_module, \ .store = _prefix##_store, \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler"); QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors, "atomic_write_boundary_bytes"); QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, .show = queue_logical_block_size_show, }; QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats"); QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random"); QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef CONFIG_BLK_WBT static ssize_t queue_var_store64(s64 *var, const char *page) { int err; s64 v; err = kstrtos64(page, 10, &v); if (err < 0) return err; *var = v; return 0; } static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { if (!wbt_rq_qos(disk->queue)) return -EINVAL; if (wbt_disabled(disk->queue)) return sysfs_emit(page, "0\n"); return sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(disk->queue), 1000)); } static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { struct request_queue *q = disk->queue; struct rq_qos *rqos; ssize_t ret; s64 val; ret = queue_var_store64(&val, page); if (ret < 0) return ret; if (val < -1) return -EINVAL; rqos = wbt_rq_qos(q); if (!rqos) { ret = wbt_init(disk); if (ret) return ret; } if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) return count; /* * Ensure that the queue is idled, in case the latency update * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); blk_mq_unquiesce_queue(q); return count; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif /* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, &queue_chunk_sectors_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_max_discard_sectors_entry.attr, &queue_max_hw_discard_sectors_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_atomic_write_max_sectors_entry.attr, &queue_atomic_write_boundary_sectors_entry.attr, &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, &queue_iostats_passthrough_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_add_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, &queue_poll_delay_entry.attr, &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { &queue_requests_entry.attr, &elv_iosched_entry.attr, &queue_rq_affinity_entry.attr, &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif NULL, }; static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) return 0; return attr->mode; } static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if (!queue_is_mq(q)) return 0; if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) return 0; return attr->mode; } static struct attribute_group queue_attr_group = { .attrs = queue_attrs, .is_visible = queue_attr_visible, }; static struct attribute_group blk_mq_queue_attr_group = { .attrs = blk_mq_queue_attrs, .is_visible = blk_mq_queue_attr_visible, }; #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); ssize_t res; if (!entry->show) return -EIO; mutex_lock(&disk->queue->sysfs_lock); res = entry->show(disk, page); mutex_unlock(&disk->queue->sysfs_lock); return res; } static ssize_t queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; unsigned int memflags; ssize_t res; if (!entry->store_limit && !entry->store) return -EIO; /* * If the attribute needs to load a module, do it before freezing the * queue to ensure that the module file can be read when the request * queue is the one for the device storing the module file. */ if (entry->load_module) entry->load_module(disk, page, length); if (entry->store_limit) { struct queue_limits lim = queue_limits_start_update(q); res = entry->store_limit(disk, page, length, &lim); if (res < 0) { queue_limits_cancel_update(q); return res; } res = queue_limits_commit_update_frozen(q, &lim); if (res) return res; return length; } mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); res = entry->store(disk, page, length); blk_mq_unfreeze_queue(q, memflags); mutex_unlock(&q->sysfs_lock); return res; } static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; static const struct attribute_group *blk_queue_attr_groups[] = { &queue_attr_group, &blk_mq_queue_attr_group, NULL }; static void blk_queue_release(struct kobject *kobj) { /* nothing to do here, all data is associated with the parent gendisk */ } static const struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_queue_release, }; static void blk_debugfs_remove(struct gendisk *disk) { struct request_queue *q = disk->queue; mutex_lock(&q->debugfs_mutex); blk_trace_shutdown(q); debugfs_remove_recursive(q->debugfs_dir); q->debugfs_dir = NULL; q->sched_debugfs_dir = NULL; q->rqos_debugfs_dir = NULL; mutex_unlock(&q->debugfs_mutex); } /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. */ int blk_register_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; int ret; kobject_init(&disk->queue_kobj, &blk_queue_ktype); ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) goto out_put_queue_kobj; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) goto out_put_queue_kobj; } mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); if (queue_is_mq(q)) blk_mq_debugfs_register(q); mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk); if (ret) goto out_debugfs_remove; if (q->elevator) { ret = elv_register_queue(q, false); if (ret) goto out_unregister_ia_ranges; } ret = blk_crypto_sysfs_register(disk); if (ret) goto out_elv_unregister; blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); /* * SCSI probing may synchronously create and destroy a lot of * request_queues for non-existent devices. Shutting down a fully * functional queue takes measureable wallclock time as RCU grace * periods are involved. To avoid excessive latency in these * cases, a request_queue starts out in a degraded mode which is * faster to shut down and is made fully functional here as * request_queues for non-existent devices never get registered. */ blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); percpu_ref_switch_to_percpu(&q->q_usage_counter); return ret; out_elv_unregister: elv_unregister_queue(q); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); out_put_queue_kobj: kobject_put(&disk->queue_kobj); return ret; } /** * blk_unregister_queue - counterpart of blk_register_queue() * @disk: Disk of which the request queue should be unregistered from sysfs. * * Note: the caller is responsible for guaranteeing that this function is called * after blk_register_queue() has finished. */ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; if (WARN_ON(!q)) return; /* Return early if disk->queue was never registered. */ if (!blk_queue_registered(q)) return; /* * Since sysfs_remove_dir() prevents adding new directory entries * before removal of existing entries starts, protect against * concurrent elv_iosched_store() calls. */ mutex_lock(&q->sysfs_lock); blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); mutex_unlock(&q->sysfs_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); mutex_lock(&q->sysfs_lock); elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); /* Now that we've deleted all child objects, we can delete the queue. */ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); kobject_del(&disk->queue_kobj); blk_debugfs_remove(disk); } |
| 1 1 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/if_ether.h> /* For the statistics structure. */ #include <linux/slab.h> #include <linux/uaccess.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> /* * Only allow IP over NET/ROM frames through if the netrom device is up. */ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; if (!netif_running(dev)) { stats->rx_dropped++; return 0; } stats->rx_packets++; stats->rx_bytes += skb->len; skb->protocol = htons(ETH_P_IP); /* Spoof incoming device */ skb->dev = dev; skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->pkt_type = PACKET_HOST; netif_rx(skb); return 1; } static int nr_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { unsigned char *buff = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); memcpy(buff, (saddr != NULL) ? saddr : dev->dev_addr, dev->addr_len); buff[6] &= ~AX25_CBIT; buff[6] &= ~AX25_EBIT; buff[6] |= AX25_SSSID_SPARE; buff += AX25_ADDR_LEN; if (daddr != NULL) memcpy(buff, daddr, dev->addr_len); buff[6] &= ~AX25_CBIT; buff[6] |= AX25_EBIT; buff[6] |= AX25_SSSID_SPARE; buff += AX25_ADDR_LEN; *buff++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); *buff++ = NR_PROTO_IP; *buff++ = NR_PROTO_IP; *buff++ = 0; *buff++ = 0; *buff++ = NR_PROTOEXT; if (daddr != NULL) return 37; return -37; } static int __must_check nr_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; int err; if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { err = ax25_listen_register((ax25_address *)sa->sa_data, NULL); if (err) return err; ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); } dev_addr_set(dev, sa->sa_data); return 0; } static int nr_open(struct net_device *dev) { int err; err = ax25_listen_register((const ax25_address *)dev->dev_addr, NULL); if (err) return err; netif_start_queue(dev); return 0; } static int nr_close(struct net_device *dev) { ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); netif_stop_queue(dev); return 0; } static netdev_tx_t nr_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!nr_route_frame(skb, NULL)) { kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } stats->tx_packets++; stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops nr_header_ops = { .create = nr_header, }; static const struct net_device_ops nr_netdev_ops = { .ndo_open = nr_open, .ndo_stop = nr_close, .ndo_start_xmit = nr_xmit, .ndo_set_mac_address = nr_set_mac_address, }; void nr_setup(struct net_device *dev) { dev->mtu = NR_MAX_PACKET_SIZE; dev->netdev_ops = &nr_netdev_ops; dev->header_ops = &nr_header_ops; dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_NETROM; /* New-style flags. */ dev->flags = IFF_NOARP; } |
| 89 87 3 58 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | // SPDX-License-Identifier: GPL-2.0-only /* loopback transport for vsock using virtio_transport_common APIs * * Copyright (C) 2013-2019 Red Hat, Inc. * Authors: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> * Stefano Garzarella <sgarzare@redhat.com> * */ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/list.h> #include <linux/virtio_vsock.h> struct vsock_loopback { struct workqueue_struct *workqueue; struct sk_buff_head pkt_queue; struct work_struct pkt_work; }; static struct vsock_loopback the_vsock_loopback; static u32 vsock_loopback_get_local_cid(void) { return VMADDR_CID_LOCAL; } static int vsock_loopback_send_pkt(struct sk_buff *skb) { struct vsock_loopback *vsock = &the_vsock_loopback; int len = skb->len; virtio_vsock_skb_queue_tail(&vsock->pkt_queue, skb); queue_work(vsock->workqueue, &vsock->pkt_work); return len; } static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) { struct vsock_loopback *vsock = &the_vsock_loopback; virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); return 0; } static bool vsock_loopback_seqpacket_allow(u32 remote_cid); static bool vsock_loopback_msgzerocopy_allow(void) { return true; } static struct virtio_transport loopback_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = vsock_loopback_get_local_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, .cancel_pkt = vsock_loopback_cancel_pkt, .dgram_bind = virtio_transport_dgram_bind, .dgram_dequeue = virtio_transport_dgram_dequeue, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_allow = virtio_transport_dgram_allow, .stream_dequeue = virtio_transport_stream_dequeue, .stream_enqueue = virtio_transport_stream_enqueue, .stream_has_data = virtio_transport_stream_has_data, .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, .seqpacket_allow = vsock_loopback_seqpacket_allow, .seqpacket_has_data = virtio_transport_seqpacket_has_data, .msgzerocopy_allow = vsock_loopback_msgzerocopy_allow, .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, .notify_send_init = virtio_transport_notify_send_init, .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .unsent_bytes = virtio_transport_unsent_bytes, .read_skb = virtio_transport_read_skb, }, .send_pkt = vsock_loopback_send_pkt, }; static bool vsock_loopback_seqpacket_allow(u32 remote_cid) { return true; } static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = container_of(work, struct vsock_loopback, pkt_work); struct sk_buff_head pkts; struct sk_buff *skb; skb_queue_head_init(&pkts); spin_lock_bh(&vsock->pkt_queue.lock); skb_queue_splice_init(&vsock->pkt_queue, &pkts); spin_unlock_bh(&vsock->pkt_queue.lock); while ((skb = __skb_dequeue(&pkts))) { /* Decrement the bytes_unsent counter without deallocating skb * It is freed by the receiver. */ virtio_transport_consume_skb_sent(skb, false); virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&loopback_transport, skb); } } static int __init vsock_loopback_init(void) { struct vsock_loopback *vsock = &the_vsock_loopback; int ret; vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); if (!vsock->workqueue) return -ENOMEM; skb_queue_head_init(&vsock->pkt_queue); INIT_WORK(&vsock->pkt_work, vsock_loopback_work); ret = vsock_core_register(&loopback_transport.transport, VSOCK_TRANSPORT_F_LOCAL); if (ret) goto out_wq; return 0; out_wq: destroy_workqueue(vsock->workqueue); return ret; } static void __exit vsock_loopback_exit(void) { struct vsock_loopback *vsock = &the_vsock_loopback; vsock_core_unregister(&loopback_transport.transport); flush_work(&vsock->pkt_work); virtio_vsock_skb_queue_purge(&vsock->pkt_queue); destroy_workqueue(vsock->workqueue); } module_init(vsock_loopback_init); module_exit(vsock_loopback_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Stefano Garzarella <sgarzare@redhat.com>"); MODULE_DESCRIPTION("loopback transport for vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK); |
| 4 2 1 2 2 3 3 3 3 3 2 2 2 4 4 4 5 5 5 1 1 1 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | // SPDX-License-Identifier: GPL-2.0-or-later /* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6. * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv4/xfrm4_protocol.c */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <net/ip6_route.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly; static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly; static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly; static DEFINE_MUTEX(xfrm6_protocol_mutex); static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol) { switch (protocol) { case IPPROTO_ESP: return &esp6_handlers; case IPPROTO_AH: return &ah6_handlers; case IPPROTO_COMP: return &ipcomp6_handlers; } return NULL; } #define for_each_protocol_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm6_protocol *handler; struct xfrm6_protocol __rcu **head = proto_handlers(protocol); if (!head) return 0; for_each_protocol_rcu(*proto_handlers(protocol), handler) if ((ret = handler->cb_handler(skb, err)) <= 0) return ret; return 0; } int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { int ret; struct xfrm6_protocol *handler; struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); if (!head) goto out; if (!skb_dst(skb)) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, skb, flags); if (dst->error) goto drop; skb_dst_set(skb, dst); } for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; out: icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm6_rcv_encap); static int xfrm6_esp_rcv(struct sk_buff *skb) { int ret; struct xfrm6_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; for_each_protocol_rcu(esp6_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(esp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int xfrm6_ah_rcv(struct sk_buff *skb) { int ret; struct xfrm6_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; for_each_protocol_rcu(ah6_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ah6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int xfrm6_ipcomp_rcv(struct sk_buff *skb) { int ret; struct xfrm6_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; for_each_protocol_rcu(ipcomp6_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ipcomp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static const struct inet6_protocol esp6_protocol = { .handler = xfrm6_esp_rcv, .err_handler = xfrm6_esp_err, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol ah6_protocol = { .handler = xfrm6_ah_rcv, .err_handler = xfrm6_ah_err, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol ipcomp6_protocol = { .handler = xfrm6_ipcomp_rcv, .err_handler = xfrm6_ipcomp_err, .flags = INET6_PROTO_NOPOLICY, }; static const struct xfrm_input_afinfo xfrm6_input_afinfo = { .family = AF_INET6, .callback = xfrm6_rcv_cb, }; static inline const struct inet6_protocol *netproto(unsigned char protocol) { switch (protocol) { case IPPROTO_ESP: return &esp6_protocol; case IPPROTO_AH: return &ah6_protocol; case IPPROTO_COMP: return &ipcomp6_protocol; } return NULL; } int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol) { struct xfrm6_protocol __rcu **pprev; struct xfrm6_protocol *t; bool add_netproto = false; int ret = -EEXIST; int priority = handler->priority; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm6_protocol_mutex); if (!rcu_dereference_protected(*proto_handlers(protocol), lockdep_is_held(&xfrm6_protocol_mutex))) add_netproto = true; for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; pprev = &t->next) { if (t->priority < priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&xfrm6_protocol_mutex); if (add_netproto) { if (inet6_add_protocol(netproto(protocol), protocol)) { pr_err("%s: can't add protocol\n", __func__); ret = -EAGAIN; } } return ret; } EXPORT_SYMBOL(xfrm6_protocol_register); int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol) { struct xfrm6_protocol __rcu **pprev; struct xfrm6_protocol *t; int ret = -ENOENT; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm6_protocol_mutex); for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } if (!rcu_dereference_protected(*proto_handlers(protocol), lockdep_is_held(&xfrm6_protocol_mutex))) { if (inet6_del_protocol(netproto(protocol), protocol) < 0) { pr_err("%s: can't remove protocol\n", __func__); ret = -EAGAIN; } } mutex_unlock(&xfrm6_protocol_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm6_protocol_deregister); int __init xfrm6_protocol_init(void) { return xfrm_input_register_afinfo(&xfrm6_input_afinfo); } void xfrm6_protocol_fini(void) { xfrm_input_unregister_afinfo(&xfrm6_input_afinfo); } |
| 9 3 9 6 9 9 9 8 4 8 4 8 4 8 9 4 8 9 9 4 19 18 1 17 1 17 16 1 16 15 14 1 15 15 15 15 15 15 15 15 15 14 15 15 15 14 12 12 2 2 2 2 4 8 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_queues.h> #include "netlink.h" #include "common.h" struct rings_req_info { struct ethnl_req_info base; }; struct rings_reply_data { struct ethnl_reply_data base; struct ethtool_ringparam ringparam; struct kernel_ethtool_ringparam kernel_ringparam; u32 supported_ring_params; }; #define RINGS_REPDATA(__reply_base) \ container_of(__reply_base, struct rings_reply_data, base) const struct nla_policy ethnl_rings_get_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; data->supported_ring_params = dev->ethtool_ops->supported_ring_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; } static int rings_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */ nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX*/ } static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; u32 supported_ring_params = data->supported_ring_params; WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED); if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX, ringparam->rx_pending))) || (ringparam->rx_mini_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX, ringparam->rx_mini_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI, ringparam->rx_mini_pending))) || (ringparam->rx_jumbo_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX, ringparam->rx_jumbo_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO, ringparam->rx_jumbo_pending))) || (ringparam->tx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || (kr->rx_buf_len && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, kr->tcp_data_split))) || (kr->cqe_size && (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) || nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) || ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, kr->tx_push_buf_len))) || ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) && (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH, kr->hds_thresh) || nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX, kr->hds_thresh_max)))) return -EMSGSIZE; return 0; } /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] = NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 }, }; static int ethnl_set_rings_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "setting TCP data split is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_HDS_THRESH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_HDS_THRESH], "setting hds-thresh is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_RX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_PUSH], "setting rx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "setting tx push buf len is not supported"); return -EOPNOTSUPP; } return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { struct kernel_ethtool_ringparam kernel_ringparam = {}; struct ethtool_ringparam ringparam = {}; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; dev->ethtool_ops->get_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); kernel_ringparam.tcp_data_split = dev->cfg->hds_config; ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, tb[ETHTOOL_A_RINGS_RX_MINI], &mod); ethnl_update_u32(&ringparam.rx_jumbo_pending, tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ethnl_update_u8(&kernel_ringparam.tcp_data_split, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); ethnl_update_u8(&kernel_ringparam.rx_push, tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); ethnl_update_u32(&kernel_ringparam.hds_thresh, tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod); if (!mod) return 0; if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && dev_xdp_sb_prog_count(dev)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "tcp-data-split can not be enabled with single buffer XDP"); return -EINVAL; } if (dev_get_min_mp_channel_count(dev)) { if (kernel_ringparam.tcp_data_split != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(info->extack, "can't disable tcp-data-split while device has memory provider enabled"); return -EINVAL; } else if (kernel_ringparam.hds_thresh) { NL_SET_ERR_MSG(info->extack, "can't set non-zero hds_thresh while device is memory provider enabled"); return -EINVAL; } } /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_MINI]; else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_TX]; else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max) err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH]; else err_attr = NULL; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); return -EINVAL; } if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) { NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "Requested TX push buffer exceeds the maximum of %u", kernel_ringparam.tx_push_buf_max_len); return -EINVAL; } dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_rings_request_ops = { .request_cmd = ETHTOOL_MSG_RINGS_GET, .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, .hdr_attr = ETHTOOL_A_RINGS_HEADER, .req_info_size = sizeof(struct rings_req_info), .reply_data_size = sizeof(struct rings_reply_data), .prepare_data = rings_prepare_data, .reply_size = rings_reply_size, .fill_reply = rings_fill_reply, .set_validate = ethnl_set_rings_validate, .set = ethnl_set_rings, .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, }; |
| 23 23 2 2 15 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | // SPDX-License-Identifier: GPL-2.0-only /* iptables module for using new netfilter netlink queue * * (C) 2005 by Harald Welte <laforge@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFQUEUE.h> #include <net/netfilter/nf_queue.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFQUEUE"); MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static u32 jhash_initval __read_mostly; static unsigned int nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info *tinfo = par->targinfo; return NF_QUEUE_NR(tinfo->queuenum); } static unsigned int nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v1 *info = par->targinfo; u32 queue = info->queuenum; if (info->queues_total > 1) { queue = nfqueue_hash(skb, queue, info->queues_total, xt_family(par), jhash_initval); } return NF_QUEUE_NR(queue); } static unsigned int nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v2 *info = par->targinfo; unsigned int ret = nfqueue_tg_v1(skb, par); if (info->bypass) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; return ret; } static int nfqueue_tg_check(const struct xt_tgchk_param *par) { const struct xt_NFQ_info_v3 *info = par->targinfo; u32 maxid; init_hashrandom(&jhash_initval); if (info->queues_total == 0) { pr_info_ratelimited("number of total queues is 0\n"); return -EINVAL; } maxid = info->queues_total - 1 + info->queuenum; if (maxid > 0xffff) { pr_info_ratelimited("number of queues (%u) out of range (got %u)\n", info->queues_total, maxid); return -ERANGE; } if (par->target->revision == 2 && info->flags > 1) return -EINVAL; if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK) return -EINVAL; return 0; } static unsigned int nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_NFQ_info_v3 *info = par->targinfo; u32 queue = info->queuenum; int ret; if (info->queues_total > 1) { if (info->flags & NFQ_FLAG_CPU_FANOUT) { int cpu = smp_processor_id(); queue = info->queuenum + cpu % info->queues_total; } else { queue = nfqueue_hash(skb, queue, info->queues_total, xt_family(par), jhash_initval); } } ret = NF_QUEUE_NR(queue); if (info->flags & NFQ_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; return ret; } static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", .family = NFPROTO_UNSPEC, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v1, .targetsize = sizeof(struct xt_NFQ_info_v1), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 2, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v2, .targetsize = sizeof(struct xt_NFQ_info_v2), .me = THIS_MODULE, }, { .name = "NFQUEUE", .revision = 3, .family = NFPROTO_UNSPEC, .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v3, .targetsize = sizeof(struct xt_NFQ_info_v3), .me = THIS_MODULE, }, }; static int __init nfqueue_tg_init(void) { return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } static void __exit nfqueue_tg_exit(void) { xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } module_init(nfqueue_tg_init); module_exit(nfqueue_tg_exit); |
| 1 1 1 1 3 3 3 3 1 1 1 1 1 1 2 9 6 6 1 1 7 2 5 7 9 9 3 2 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the list:set type */ #include <linux/module.h> #include <linux/ip.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_list.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support added */ /* 2 Comments support added */ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); /* Member elements */ struct set_elem { struct rcu_head rcu; struct list_head list; struct ip_set *set; /* Sigh, in order to cleanup reference */ ip_set_id_t id; } __aligned(__alignof__(u64)); struct set_adt_elem { ip_set_id_t id; ip_set_id_t refid; int before; }; /* Type structure */ struct list_set { u32 size; /* size of set list array */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ struct net *net; /* namespace */ struct list_head members; /* the set members */ }; static int list_set_ktest(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; struct ip_set_ext *mext = &opt->ext; struct set_elem *e; u32 flags = opt->cmdflags; int ret; /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE; list_for_each_entry_rcu(e, &map->members, list) { ret = ip_set_test(e->id, skb, par, opt); if (ret <= 0) continue; if (ip_set_match_extensions(set, ext, mext, flags, e)) return 1; } return 0; } static int list_set_kadd(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; struct set_elem *e; int ret; list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; ret = ip_set_add(e->id, skb, par, opt); if (ret == 0) return ret; } return 0; } static int list_set_kdel(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; struct set_elem *e; int ret; list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; ret = ip_set_del(e->id, skb, par, opt); if (ret == 0) return ret; } return 0; } static int list_set_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); int ret = -EINVAL; rcu_read_lock(); switch (adt) { case IPSET_TEST: ret = list_set_ktest(set, skb, par, opt, &ext); break; case IPSET_ADD: ret = list_set_kadd(set, skb, par, opt, &ext); break; case IPSET_DEL: ret = list_set_kdel(set, skb, par, opt, &ext); break; default: break; } rcu_read_unlock(); return ret; } /* Userspace interfaces: we are protected by the nfnl mutex */ static void __list_set_del_rcu(struct rcu_head * rcu) { struct set_elem *e = container_of(rcu, struct set_elem, rcu); struct ip_set *set = e->set; ip_set_ext_destroy(set, e); kfree(e); } static void list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; set->elements--; list_del_rcu(&e->list); ip_set_put_byindex(map->net, e->id); call_rcu(&e->rcu, __list_set_del_rcu); } static void list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { struct list_set *map = set->data; list_replace_rcu(&old->list, &e->list); ip_set_put_byindex(map->net, old->id); call_rcu(&old->rcu, __list_set_del_rcu); } static void set_cleanup_entries(struct ip_set *set) { struct list_set *map = set->data; struct set_elem *e, *n; list_for_each_entry_safe(e, n, &map->members, list) if (ip_set_timeout_expired(ext_timeout(e, set))) list_set_del(set, e); } static int list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *next, *prev = NULL; int ret = 0; rcu_read_lock(); list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; else if (e->id != d->id) { prev = e; continue; } if (d->before == 0) { ret = 1; goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && next->id == d->refid; } else { ret = prev && prev->id == d->refid; } goto out; } out: rcu_read_unlock(); return ret; } static void list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct set_elem *e) { if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(set, ext_comment(e, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Update timeout last */ if (SET_WITH_TIMEOUT(set)) ip_set_timeout_set(ext_timeout(e, set), ext->timeout); } static int list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; /* Find where to add the new entry */ n = prev = next = NULL; list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; else if (d->id == e->id) n = e; else if (d->before == 0 || e->id != d->refid) continue; else if (d->before > 0) next = e; else prev = e; } /* If before/after is used on an empty set */ if ((d->before > 0 && !next) || (d->before < 0 && !prev)) return -IPSET_ERR_REF_EXIST; /* Re-add already existing element */ if (n) { if (!flag_exist) return -IPSET_ERR_EXIST; /* Update extensions */ ip_set_ext_destroy(set, n); list_set_init_extensions(set, ext, n); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; } /* Add new entry */ if (d->before == 0) { /* Append */ n = list_empty(&map->members) ? NULL : list_last_entry(&map->members, struct set_elem, list); } else if (d->before > 0) { /* Insert after next element */ if (!list_is_last(&next->list, &map->members)) n = list_next_entry(next, list); } else { /* Insert before prev element */ if (prev->list.prev != &map->members) n = list_prev_entry(prev, list); } /* Can we replace a timed out entry? */ if (n && !(SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(n, set)))) n = NULL; e = kzalloc(set->dsize, GFP_ATOMIC); if (!e) return -ENOMEM; e->id = d->id; e->set = set; INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) list_set_replace(set, e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) list_add_rcu(&e->list, &prev->list); else list_add_tail_rcu(&e->list, &map->members); set->elements++; return 0; } static int list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *n, *next, *prev = NULL; list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; else if (e->id != d->id) { prev = e; continue; } if (d->before > 0) { next = list_next_entry(e, list); if (list_is_last(&e->list, &map->members) || next->id != d->refid) return -IPSET_ERR_REF_EXIST; } else if (d->before < 0) { if (!prev || prev->id != d->refid) return -IPSET_ERR_REF_EXIST; } list_set_del(set, e); return 0; } return d->before != 0 ? -IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST; } static int list_set_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct list_set *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); struct ip_set *s; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_NAME] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s); if (e.id == IPSET_INVALID_ID) return -IPSET_ERR_NAME; /* "Loop detection" */ if (s->type->features & IPSET_TYPE_NAME) { ret = -IPSET_ERR_LOOP; goto finish; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); e.before = f & IPSET_FLAG_BEFORE; } if (e.before && !tb[IPSET_ATTR_NAMEREF]) { ret = -IPSET_ERR_BEFORE; goto finish; } if (tb[IPSET_ATTR_NAMEREF]) { e.refid = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAMEREF]), &s); if (e.refid == IPSET_INVALID_ID) { ret = -IPSET_ERR_NAMEREF; goto finish; } if (!e.before) e.before = -1; } if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set)) set_cleanup_entries(set); ret = adtfn(set, &e, &ext, &ext, flags); finish: if (e.refid != IPSET_INVALID_ID) ip_set_put_byindex(map->net, e.refid); if (adt != IPSET_ADD || ret) ip_set_put_byindex(map->net, e.id); return ip_set_eexist(ret, flags) ? 0 : ret; } static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; struct set_elem *e, *n; list_for_each_entry_safe(e, n, &map->members, list) list_set_del(set, e); set->elements = 0; set->ext_size = 0; } static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; } /* Calculate the actual memory size of the set data */ static size_t list_set_memsize(const struct list_set *map, size_t dsize) { struct set_elem *e; u32 n = 0; rcu_read_lock(); list_for_each_entry_rcu(e, &map->members, list) n++; rcu_read_unlock(); return (sizeof(*map) + n * dsize); } static int list_set_head(struct ip_set *set, struct sk_buff *skb) { const struct list_set *map = set->data; struct nlattr *nested; size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; nla_nest_end(skb, nested); return 0; nla_put_failure: return -EMSGSIZE; } static int list_set_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { const struct list_set *map = set->data; struct nlattr *atd, *nested; u32 i = 0, first = cb->args[IPSET_CB_ARG0]; char name[IPSET_MAXNAMELEN]; struct set_elem *e; int ret = 0; atd = nla_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; rcu_read_lock(); list_for_each_entry_rcu(e, &map->members, list) { if (i < first || (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set)))) { i++; continue; } nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; ip_set_name_byindex(map->net, e->id, name); if (nla_put_string(skb, IPSET_ATTR_NAME, name)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; nla_nest_end(skb, nested); i++; } nla_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(i == first)) { nla_nest_cancel(skb, atd); cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } else { cb->args[IPSET_CB_ARG0] = i; nla_nest_end(skb, atd); } out: rcu_read_unlock(); return ret; } static bool list_set_same_set(const struct ip_set *a, const struct ip_set *b) { const struct list_set *x = a->data; const struct list_set *y = b->data; return x->size == y->size && a->timeout == b->timeout && a->extensions == b->extensions; } static void list_set_cancel_gc(struct ip_set *set) { struct list_set *map = set->data; if (SET_WITH_TIMEOUT(set)) timer_shutdown_sync(&map->gc); /* Flush list to drop references to other ipsets */ list_set_flush(set); } static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, .adt = { [IPSET_ADD] = list_set_uadd, [IPSET_DEL] = list_set_udel, [IPSET_TEST] = list_set_utest, }, .destroy = list_set_destroy, .flush = list_set_flush, .head = list_set_head, .list = list_set_list, .same_set = list_set_same_set, .cancel_gc = list_set_cancel_gc, }; static void list_set_gc(struct timer_list *t) { struct list_set *map = from_timer(map, t, gc); struct ip_set *set = map->set; spin_lock_bh(&set->lock); set_cleanup_entries(set); spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } static void list_set_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct list_set *map = set->data; timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } /* Create list:set type of sets */ static bool init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; map = kzalloc(sizeof(*map), GFP_KERNEL); if (!map) return false; map->size = size; map->net = net; map->set = set; INIT_LIST_HEAD(&map->members); set->data = map; return true; } static struct lock_class_key list_set_lockdep_key; static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 size = IP_SET_LIST_DEFAULT_SIZE; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_SIZE]) size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]); if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); if (!init_list_set(net, set, size)) return -ENOMEM; if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); list_set_gc_init(set, list_set_gc); } return 0; } static struct ip_set_type list_set_type __read_mostly = { .name = "list:set", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create = list_set_create, .create_policy = { [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_NAME] = { .type = NLA_STRING, .len = IPSET_MAXNAMELEN }, [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING, .len = IPSET_MAXNAMELEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init list_set_init(void) { return ip_set_type_register(&list_set_type); } static void __exit list_set_fini(void) { rcu_barrier(); ip_set_type_unregister(&list_set_type); } module_init(list_set_init); module_exit(list_set_fini); |
| 2 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * symlink.c */ /* * This file implements code to handle symbolic links. * * The data contents of symbolic links are stored inside the symbolic * link inode within the inode table. This allows the normally small symbolic * link to be compressed as part of the inode table, achieving much greater * compression than if the symbolic link was compressed individually. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/pagemap.h> #include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" static int squashfs_symlink_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; int index = folio_pos(folio); u64 block = squashfs_i(inode)->start; int offset = squashfs_i(inode)->offset; int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE); int bytes, copied, error; void *pageaddr; struct squashfs_cache_entry *entry; TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " "%llx, offset %x\n", folio->index, block, offset); /* * Skip index bytes into symlink metadata. */ if (index) { bytes = squashfs_read_metadata(sb, NULL, &block, &offset, index); if (bytes < 0) { ERROR("Unable to read symlink [%llx:%x]\n", squashfs_i(inode)->start, squashfs_i(inode)->offset); error = bytes; goto out; } } /* * Read length bytes from symlink metadata. Squashfs_read_metadata * is not used here because it can sleep and we want to use * kmap_local to map the folio. Instead call the underlying * squashfs_cache_get routine. As length bytes may overlap metadata * blocks, we may need to call squashfs_cache_get multiple times. */ for (bytes = 0; bytes < length; offset = 0, bytes += copied) { entry = squashfs_cache_get(sb, msblk->block_cache, block, 0); if (entry->error) { ERROR("Unable to read symlink [%llx:%x]\n", squashfs_i(inode)->start, squashfs_i(inode)->offset); squashfs_cache_put(entry); error = entry->error; goto out; } pageaddr = kmap_local_folio(folio, 0); copied = squashfs_copy_data(pageaddr + bytes, entry, offset, length - bytes); if (copied == length - bytes) memset(pageaddr + length, 0, PAGE_SIZE - length); else block = entry->next_index; kunmap_local(pageaddr); squashfs_cache_put(entry); } flush_dcache_folio(folio); error = 0; out: folio_end_read(folio, error == 0); return error; } const struct address_space_operations squashfs_symlink_aops = { .read_folio = squashfs_symlink_read_folio }; const struct inode_operations squashfs_symlink_inode_ops = { .get_link = page_get_link, .listxattr = squashfs_listxattr }; |
| 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | // SPDX-License-Identifier: MIT #include <drm/clients/drm_client_setup.h> #include <drm/drm_device.h> #include <drm/drm_fourcc.h> #include <drm/drm_print.h> #include "drm_client_internal.h" static char drm_client_default[16] = CONFIG_DRM_CLIENT_DEFAULT; module_param_string(active, drm_client_default, sizeof(drm_client_default), 0444); MODULE_PARM_DESC(active, "Choose which drm client to start, default is" CONFIG_DRM_CLIENT_DEFAULT "]"); /** * drm_client_setup() - Setup in-kernel DRM clients * @dev: DRM device * @format: Preferred pixel format for the device. Use NULL, unless * there is clearly a driver-preferred format. * * This function sets up the in-kernel DRM clients. Restore, hotplug * events and teardown are all taken care of. * * Drivers should call drm_client_setup() after registering the new * DRM device with drm_dev_register(). This function is safe to call * even when there are no connectors present. Setup will be retried * on the next hotplug event. * * The clients are destroyed by drm_dev_unregister(). */ void drm_client_setup(struct drm_device *dev, const struct drm_format_info *format) { #ifdef CONFIG_DRM_FBDEV_EMULATION if (!strcmp(drm_client_default, "fbdev")) { int ret; ret = drm_fbdev_client_setup(dev, format); if (ret) drm_warn(dev, "Failed to set up DRM client; error %d\n", ret); return; } #endif #ifdef CONFIG_DRM_CLIENT_LOG if (!strcmp(drm_client_default, "log")) { drm_log_register(dev); return; } #endif if (strcmp(drm_client_default, "")) drm_warn(dev, "Unknown DRM client %s\n", drm_client_default); } EXPORT_SYMBOL(drm_client_setup); /** * drm_client_setup_with_fourcc() - Setup in-kernel DRM clients for color mode * @dev: DRM device * @fourcc: Preferred pixel format as 4CC code for the device * * This function sets up the in-kernel DRM clients. It is equivalent * to drm_client_setup(), but expects a 4CC code as second argument. */ void drm_client_setup_with_fourcc(struct drm_device *dev, u32 fourcc) { drm_client_setup(dev, drm_format_info(fourcc)); } EXPORT_SYMBOL(drm_client_setup_with_fourcc); /** * drm_client_setup_with_color_mode() - Setup in-kernel DRM clients for color mode * @dev: DRM device * @color_mode: Preferred color mode for the device * * This function sets up the in-kernel DRM clients. It is equivalent * to drm_client_setup(), but expects a color mode as second argument. * * Do not use this function in new drivers. Prefer drm_client_setup() with a * format of NULL. */ void drm_client_setup_with_color_mode(struct drm_device *dev, unsigned int color_mode) { u32 fourcc = drm_driver_color_mode_format(dev, color_mode); drm_client_setup_with_fourcc(dev, fourcc); } EXPORT_SYMBOL(drm_client_setup_with_color_mode); MODULE_DESCRIPTION("In-kernel DRM clients"); MODULE_LICENSE("GPL and additional rights"); |
| 7 1 4 1 1 2 2 1 201 198 11 1 7 3 2 7 12 14 3 3 3 16 16 16 16 16 29 29 7 5 1 6 5 5 1 16 16 1 3 4 4 4 94 15 9 2 3 3 2 4 21 1 9 10 4 15 17 2 14 14 1 1 1 2 10 10 2 6 6 2 4 1 1 4 25 91 1 90 91 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 | // SPDX-License-Identifier: GPL-2.0 /* * XFRM virtual interface * * Copyright (C) 2018 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_link.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; static const struct net_device_ops xfrmi_netdev_ops; #define XFRMI_HASH_BITS 8 #define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; struct xfrm_if __rcu *collect_md_xfrmi; }; static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), }; static void xfrmi_destroy_state(struct lwtunnel_state *lwt) { } static int xfrmi_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if (!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); } static int xfrmi_create(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = dev_net(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->p.link); } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; err = gro_cells_init(&xi->gro_cells, dev); if (err) return err; dev->lltx = true; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; if (xi->p.collect_md) { NL_SET_ERR_MSG(extack, "device can't be changed to collect_md"); return -EINVAL; } } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->net); } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_exit_list, exit_list) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } xi = rtnl_dereference(xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, dev_to_kill); } } static struct pernet_operations xfrmi_net_ops = { .exit_batch_rtnl = xfrmi_exit_batch_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto xfrmi4_failed; msg = "xfrm6 protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface"); |
| 78 81 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal AFS stuff * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> #include <linux/mm_types.h> #include <linux/dns_resolver.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" #define AFS_CELL_MAX_ADDRS 15 struct pagevec; struct afs_call; struct afs_vnode; struct afs_server_probe; /* * Partial file-locking emulation mode. (The problem being that AFS3 only * allows whole-file locks and no upgrading/downgrading). */ enum afs_flock_mode { afs_flock_mode_unset, afs_flock_mode_local, /* Local locking only */ afs_flock_mode_openafs, /* Don't get server lock for a partial lock */ afs_flock_mode_strict, /* Always get a server lock for a partial lock */ afs_flock_mode_write, /* Get an exclusive server lock for a partial lock */ }; struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ bool dyn_root; /* T if dynamic root */ bool no_cell; /* T if the source is "none" (for dynroot) */ enum afs_flock_mode flock_mode; /* Partial file-locking emulation mode */ afs_voltype_t type; /* type of volume requested */ unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ struct key *key; /* key to use for secure mounting */ }; enum afs_call_state { AFS_CALL_CL_REQUESTING, /* Client: Request is being sent */ AFS_CALL_CL_AWAIT_REPLY, /* Client: Awaiting reply */ AFS_CALL_CL_PROC_REPLY, /* Client: rxrpc call complete; processing reply */ AFS_CALL_SV_AWAIT_OP_ID, /* Server: Awaiting op ID */ AFS_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting request data */ AFS_CALL_SV_REPLYING, /* Server: Replying */ AFS_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ AFS_CALL_COMPLETE, /* Completed or failed */ }; /* * Address preferences. */ struct afs_addr_preference { union { struct in_addr ipv4_addr; /* AF_INET address to compare against */ struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */ }; sa_family_t family; /* Which address to use */ u16 prio; /* Priority */ u8 subnet_mask; /* How many bits to compare */ }; struct afs_addr_preference_list { struct rcu_head rcu; u16 version; /* Incremented when prefs list changes */ u8 ipv6_off; /* Offset of IPv6 addresses */ u8 nr; /* Number of addresses in total */ u8 max_prefs; /* Number of prefs allocated */ struct afs_addr_preference prefs[] __counted_by(max_prefs); }; struct afs_address { struct rxrpc_peer *peer; short last_error; /* Last error from this address */ u16 prio; /* Address priority */ }; /* * List of server addresses. */ struct afs_addr_list { struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ unsigned int debug_id; unsigned int addr_pref_version; /* Version of address preference list */ unsigned char max_addrs; unsigned char nr_addrs; unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ enum dns_record_source source:8; enum dns_lookup_status status:8; unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ struct afs_address addrs[] __counted_by(max_addrs); #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; /* * a record of an in-progress RxRPC call */ struct afs_call { const struct afs_call_type *type; /* type of call */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ struct work_struct free_work; /* Deferred free processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ struct afs_net *net; /* The network namespace */ struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ void *request; /* request data (first part) */ size_t iov_len; /* Size of *iter to be used */ struct iov_iter def_iter; /* Default buffer/data iterator */ struct iov_iter *write_iter; /* Iterator defining write to be made */ struct iov_iter *iter; /* Iterator currently in use */ union { /* Convenience for ->def_iter */ struct kvec kvec[1]; struct bio_vec bvec[1]; }; void *buffer; /* reply receive buffer */ union { struct afs_endpoint_state *probe; struct afs_addr_list *vl_probe; struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; char *ret_str; }; struct afs_fid fid; /* Primary vnode ID (or all zeroes) */ unsigned char probe_index; /* Address in ->probe_alist */ struct afs_operation *op; unsigned int server_index; refcount_t ref; enum afs_call_state state; spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ unsigned long long remaining; /* How much is left to receive */ unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ bool drop_ref; /* T if need to drop ref for incoming call */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ struct { __be32 tmp_u; __be32 tmp; } __attribute__((packed)); __be64 tmp64; }; ktime_t issue_time; /* Time of issue of operation */ }; struct afs_call_type { const char *name; unsigned int op; /* Really enum afs_fs_operation */ /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ int (*deliver)(struct afs_call *call); /* clean up a call */ void (*destructor)(struct afs_call *call); /* Async receive processing function */ void (*async_rx)(struct work_struct *work); /* Work function */ void (*work)(struct work_struct *work); /* Call done function (gets called immediately on success or failure) */ void (*done)(struct afs_call *call); /* Handle a call being immediately cancelled. */ void (*immediate_cancel)(struct afs_call *call); }; /* * Key available for writeback on a file. */ struct afs_wb_key { refcount_t usage; struct key *key; struct list_head vnode_link; /* Link in vnode->wb_keys */ }; /* * AFS open file information record. Pointed to by file->private_data. */ struct afs_file { struct key *key; /* The key this file was opened with */ struct afs_wb_key *wb; /* Writeback key record for this file */ }; static inline struct key *afs_file_key(struct file *file) { struct afs_file *af = file->private_data; return af->key; } /* * AFS superblock private data * - there's one superblock per volume */ struct afs_super_info { struct net *net_ns; /* Network namespace */ struct afs_cell *cell; /* The cell in which the volume resides */ struct afs_volume *volume; /* volume record */ enum afs_flock_mode flock_mode:8; /* File locking emulation mode */ bool dyn_root; /* True if dynamic root */ }; static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) { return sb->s_fs_info; } extern struct file_system_type afs_fs_type; /* * Set of substitutes for @sys. */ struct afs_sysnames { #define AFS_NR_SYSNAME 16 char *subs[AFS_NR_SYSNAME]; refcount_t usage; unsigned short nr; char blank[1]; }; /* * AFS network namespace record. */ struct afs_net { struct net *net; /* Backpointer to the owning net namespace */ struct afs_uuid uuid; bool live; /* F if this namespace is being removed */ /* AF_RXRPC I/O stuff */ struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; /* Cell database */ struct rb_root cells; struct afs_cell *ws_cell; struct work_struct cells_manager; struct timer_list cells_timer; atomic_t cells_outstanding; struct rw_semaphore cells_lock; struct mutex cells_alias_lock; struct mutex proc_cells_lock; struct hlist_head proc_cells; /* Known servers. Theoretically each fileserver can only be in one * cell, but in practice, people create aliases and subsets and there's * no easy way to distinguish them. */ seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ struct rb_root fs_servers; /* afs_server (by server UUID or address) */ struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ seqlock_t fs_addr_lock; /* For fs_addresses[46] */ struct work_struct fs_manager; struct timer_list fs_timer; struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; /* File locking renewal management */ struct mutex lock_manager_mutex; /* Misc */ struct super_block *dynroot_sb; /* Dynamic root mount superblock */ struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ struct afs_sysnames *sysnames; rwlock_t sysnames_lock; struct afs_addr_preference_list __rcu *address_prefs; u16 address_pref_version; /* Statistics counters */ atomic_t n_lookup; /* Number of lookups done */ atomic_t n_reval; /* Number of dentries needing revalidation */ atomic_t n_inval; /* Number of invalidations by the server */ atomic_t n_relpg; /* Number of invalidations by release_folio */ atomic_t n_read_dir; /* Number of directory pages read */ atomic_t n_dir_cr; /* Number of directory entry creation edits */ atomic_t n_dir_rm; /* Number of directory entry removal edits */ atomic_t n_stores; /* Number of store ops */ atomic_long_t n_store_bytes; /* Number of bytes stored */ atomic_long_t n_fetch_bytes; /* Number of bytes fetched */ atomic_t n_fetches; /* Number of data fetch ops */ }; extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_UNSET, AFS_CELL_ACTIVATING, AFS_CELL_ACTIVE, AFS_CELL_DEACTIVATING, AFS_CELL_INACTIVE, AFS_CELL_FAILED, AFS_CELL_REMOVED, }; /* * AFS cell record. * * This is a tricky concept to get right as it is possible to create aliases * simply by pointing AFSDB/SRV records for two names at the same set of VL * servers; it is also possible to do things like setting up two sets of VL * servers, one of which provides a superset of the volumes provided by the * other (for internal/external division, for example). * * Cells only exist in the sense that (a) a cell's name maps to a set of VL * servers and (b) a cell's name is used by the client to select the key to use * for authentication and encryption. The cell name is not typically used in * the protocol. * * Two cells are determined to be aliases if they have an explicit alias (YFS * only), share any VL servers in common or have at least one volume in common. * "In common" means that the address list of the VL servers or the fileservers * share at least one endpoint. */ struct afs_cell { union { struct rcu_head rcu; struct rb_node net_node; /* Node in net->cells */ }; struct afs_net *net; struct afs_cell *alias_of; /* The cell this is an alias of */ struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ struct hlist_node proc_link; /* /proc cell list link */ time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ refcount_t ref; /* Struct refcount */ atomic_t active; /* Active usage counter */ unsigned long flags; #define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ #define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ #define AFS_CELL_FL_CHECK_ALIAS 2 /* Need to check for aliases */ enum afs_cell_state state; short error; enum dns_record_source dns_source:8; /* Latest source of data from lookup */ enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ unsigned int debug_id; /* The volumes belonging to this cell */ struct rw_semaphore vs_lock; /* Lock for server->volumes */ struct rb_root volumes; /* Tree of volumes on this server */ struct hlist_head proc_volumes; /* procfs volume list */ seqlock_t volume_lock; /* For volumes */ /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ seqlock_t fs_lock; /* For fs_servers */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ struct afs_vlserver_list __rcu *vl_servers; u8 name_len; /* Length of name */ char *name; /* Cell name, case-flattened and NUL-padded */ }; /* * Volume Location server record. */ struct afs_vlserver { struct rcu_head rcu; struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */ unsigned long flags; #define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ #define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ #define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ #define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */ rwlock_t lock; /* Lock on addresses */ refcount_t ref; unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ u32 abort_code; short error; unsigned short flags; #define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */ #define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */ #define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */ #define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; u16 service_id; /* Service ID we're using */ u16 port; u16 name_len; /* Length of name */ char name[]; /* Server name, case-flattened */ }; /* * Weighted list of Volume Location servers. */ struct afs_vlserver_entry { u16 priority; /* Preference (as SRV) */ u16 weight; /* Weight (as SRV) */ enum dns_record_source source:8; enum dns_lookup_status status:8; struct afs_vlserver *server; }; struct afs_vlserver_list { struct rcu_head rcu; refcount_t ref; u8 nr_servers; u8 index; /* Server currently in use */ u8 preferred; /* Preferred server */ enum dns_record_source source:8; enum dns_lookup_status status:8; rwlock_t lock; struct afs_vlserver_entry servers[]; }; /* * Cached VLDB entry. * * This is pointed to by cell->vldb_entries, indexed by name. */ struct afs_vldb_entry { afs_volid_t vid[3]; /* Volume IDs for R/W, R/O and Bak volumes */ unsigned long flags; #define AFS_VLDB_HAS_RW 0 /* - R/W volume exists */ #define AFS_VLDB_HAS_RO 1 /* - R/O volume exists */ #define AFS_VLDB_HAS_BAK 2 /* - Backup volume exists */ #define AFS_VLDB_QUERY_VALID 3 /* - Record is valid */ #define AFS_VLDB_QUERY_ERROR 4 /* - VL server returned error */ uuid_t fs_server[AFS_NMAXNSERVERS]; u32 addr_version[AFS_NMAXNSERVERS]; /* Registration change counters */ u8 fs_mask[AFS_NMAXNSERVERS]; #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ u8 vlsf_flags[AFS_NMAXNSERVERS]; short error; u8 nr_servers; /* Number of server records */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; /* * Fileserver endpoint state. The records the addresses of a fileserver's * endpoints and the state and result of a round of probing on them. This * allows the rotation algorithm to access those results without them being * erased by a subsequent round of probing. */ struct afs_endpoint_state { struct rcu_head rcu; struct afs_addr_list *addresses; /* The addresses being probed */ unsigned long responsive_set; /* Bitset of responsive endpoints */ unsigned long failed_set; /* Bitset of endpoints we failed to probe */ refcount_t ref; unsigned int server_id; /* Debug ID of server */ unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */ atomic_t nr_probing; /* Number of outstanding probes */ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ s32 abort_code; short error; unsigned long flags; #define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */ #define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */ #define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */ #define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */ #define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */ }; /* * Record of fileserver with which we're actively communicating. */ struct afs_server { struct rcu_head rcu; union { uuid_t uuid; /* Server ID */ struct afs_uuid _uuid; }; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node uuid_rb; /* Link in net->fs_servers */ struct afs_server __rcu *uuid_next; /* Next server with same UUID */ struct afs_server *uuid_prev; /* Previous server with same UUID */ struct list_head probe_link; /* Link in net->fs_probe_list */ struct hlist_node addr_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_UPDATING 1 #define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ #define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ u16 service_id; /* Service ID we're using. */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ rwlock_t fs_lock; /* access lock */ /* Probe state */ struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ wait_queue_head_t probe_wq; unsigned int probe_counter; /* Number of probes issued */ spinlock_t probe_lock; }; enum afs_ro_replicating { AFS_RO_NOT_REPLICATING, /* Not doing replication */ AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */ AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */ } __mode(byte); /* * Replaceable volume server list. */ struct afs_server_entry { struct afs_server *server; struct afs_volume *volume; struct list_head slink; /* Link in server->volumes */ time64_t cb_expires_at; /* Time at which volume-level callback expires */ unsigned long flags; #define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */ #define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */ #define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */ }; struct afs_server_list { struct rcu_head rcu; refcount_t usage; bool attached; /* T if attached to servers */ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ unsigned char nr_servers; unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; struct afs_server_entry servers[]; }; /* * Live AFS volume management. */ struct afs_volume { struct rcu_head rcu; afs_volid_t vid; /* The volume ID of this volume */ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ struct hlist_node proc_link; /* Link in cell->proc_volumes */ struct super_block __rcu *sb; /* Superblock on which inodes reside */ struct work_struct destructor; /* Deferred destructor */ unsigned long flags; #define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ #define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ #define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ #define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ #define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */ #define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE struct fscache_volume *cache; /* Caching cookie */ #endif struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ /* RO release tracking */ struct mutex volsync_lock; /* Time/state evaluation lock */ time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ time64_t update_time; /* Volume update time (or TIME64_MIN) */ /* Callback management */ struct mutex cb_check_lock; /* Lock to control race to check after v_break */ time64_t cb_expires_at; /* Earliest volume callback expiry time */ atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ atomic_t cb_v_break; /* Volume-break event counter. */ atomic_t cb_v_check; /* Volume-break has-been-checked counter. */ atomic_t cb_scrub; /* Scrub-all-data event counter. */ rwlock_t cb_v_break_lock; struct rw_semaphore open_mmaps_lock; struct list_head open_mmaps; /* List of vnodes that are mmapped */ afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; enum afs_lock_state { AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ AFS_VNODE_LOCK_DELETED, /* The vnode has been deleted whilst we have a lock */ }; /* * AFS inode private data. * * Note that afs_alloc_inode() *must* reset anything that could incorrectly * leak from one inode to another. */ struct afs_vnode { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ struct key *silly_key; /* Silly rename key */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; #define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ #define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ #define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */ struct folio_queue *directory; /* Directory contents */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ struct key *lock_key; /* Key to be used in lock ops */ ktime_t locked_at; /* Time at which lock obtained */ enum afs_lock_state lock_state : 8; afs_lock_type_t lock_type : 8; unsigned int directory_size; /* Amount of space in ->directory */ /* outstanding callback notification on this file */ struct work_struct cb_work; /* Work for mmap'd files */ struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ void *cb_server; /* Server with callback/filelock */ atomic_t cb_nr_mmap; /* Number of mmaps */ unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */ unsigned int cb_scrub; /* Scrub counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ unsigned int cb_v_check; /* Break check counter on ->volume */ seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ atomic64_t cb_expires_at; /* time at which callback expires */ #define AFS_NO_CB_PROMISE TIME64_MIN }; static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE return netfs_i_cookie(&vnode->netfs); #else return NULL; #endif } static inline void afs_vnode_set_cache(struct afs_vnode *vnode, struct fscache_cookie *cookie) { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; if (cookie) mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } /* * cached security record for one user's attempt to access a vnode */ struct afs_permit { struct key *key; /* RxRPC ticket holding a security context */ afs_access_t access; /* CallerAccess value for this key */ }; /* * Immutable cache of CallerAccess records from attempts to access vnodes. * These may be shared between multiple vnodes. */ struct afs_permits { struct rcu_head rcu; struct hlist_node hash_node; /* Link in hash */ unsigned long h; /* Hash value for this permit list */ refcount_t usage; unsigned short nr_permits; /* Number of records */ bool invalidated; /* Invalidated due to key change */ struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */ }; /* * Error prioritisation and accumulation. */ struct afs_error { s32 abort_code; /* Cumulative abort code */ short error; /* Cumulative error */ bool responded; /* T if server responded */ bool aborted; /* T if ->error is from an abort */ }; /* * Cursor for iterating over a set of volume location servers. */ struct afs_vl_cursor { struct afs_cell *cell; /* The cell we're querying */ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ struct afs_vlserver *server; /* Server on which this resides */ struct afs_addr_list *alist; /* Current address list (pins ref) */ struct key *key; /* Key for the server */ unsigned long untried_servers; /* Bitmask of untried servers */ unsigned long addr_tried; /* Tried addresses */ struct afs_error cumul_error; /* Cumulative error */ unsigned int debug_id; s32 call_abort_code; short call_error; /* Error from single call */ short server_index; /* Current server */ signed char addr_index; /* Current address */ unsigned short flags; #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ short nr_iterations; /* Number of server iterations */ bool call_responded; /* T if the current address responded */ }; /* * Fileserver state tracking for an operation. An array of these is kept, * indexed by server index. */ struct afs_server_state { /* Tracking of fileserver probe state. Other operations may interfere * by probing a fileserver when accessing other volumes. */ unsigned int probe_seq; unsigned long untried_addrs; /* Addresses we haven't tried yet */ struct wait_queue_entry probe_waiter; struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */ }; /* * Fileserver operation methods. */ struct afs_operation_ops { void (*issue_afs_rpc)(struct afs_operation *op); void (*issue_yfs_rpc)(struct afs_operation *op); void (*success)(struct afs_operation *op); void (*aborted)(struct afs_operation *op); void (*failed)(struct afs_operation *op); void (*edit_dir)(struct afs_operation *op); void (*put)(struct afs_operation *op); }; struct afs_vnode_param { struct afs_vnode *vnode; struct afs_fid fid; /* Fid to access */ struct afs_status_cb scb; /* Returned status and callback promise */ afs_dataversion_t dv_before; /* Data version before the call */ unsigned int cb_break_before; /* cb_break before the call */ u8 dv_delta; /* Expected change in data version */ bool put_vnode:1; /* T if we have a ref on the vnode */ bool need_io_lock:1; /* T if we need the I/O lock on this */ bool update_ctime:1; /* Need to update the ctime */ bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ bool speculative:1; /* T if speculative status fetch (no vnode lock) */ bool modification:1; /* Set if the content gets modified */ }; /* * Fileserver operation wrapper, handling server and address rotation * asynchronously. May make simultaneous calls to multiple servers. */ struct afs_operation { struct afs_net *net; /* Network namespace */ struct key *key; /* Key for the cell */ const struct afs_call_type *type; /* Type of call done */ const struct afs_operation_ops *ops; /* Parameters/results for the operation */ struct afs_volume *volume; /* Volume being accessed */ struct afs_vnode_param file[2]; struct afs_vnode_param *more_files; struct afs_volsync pre_volsync; /* Volsync before op */ struct afs_volsync volsync; /* Volsync returned by op */ struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ struct timespec64 ctime; /* Change time to set */ struct afs_error cumul_error; /* Cumulative error */ short nr_files; /* Number of entries in file[], more_files */ unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ union { struct { int which; /* Which ->file[] to fetch for */ } fetch_status; struct { int reason; /* enum afs_edit_dir_reason */ mode_t mode; const char *symlink; } create; struct { bool need_rehash; } unlink; struct { struct dentry *rehash; struct dentry *tmp; bool new_negative; } rename; struct { struct netfs_io_subrequest *subreq; } fetch; struct { afs_lock_type_t type; } lock; struct { struct iov_iter *write_iter; loff_t pos; loff_t size; loff_t i_size; } store; struct { struct iattr *attr; loff_t old_i_size; } setattr; struct afs_acl *acl; struct yfs_acl *yacl; struct { struct afs_volume_status vs; struct kstatfs *buf; } volstatus; }; /* Fileserver iteration state */ struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_server *server; /* Server we're using (ref pinned by server_list) */ struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */ struct afs_server_state *server_states; /* States of the servers involved */ struct afs_call *call; unsigned long untried_servers; /* Bitmask of untried servers */ unsigned long addr_tried; /* Tried addresses */ s32 call_abort_code; /* Abort code from single call */ short call_error; /* Error from single call */ short server_index; /* Current server */ short nr_iterations; /* Number of server iterations */ signed char addr_index; /* Current address */ bool call_responded; /* T if the current address responded */ unsigned int flags; #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ #define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ #define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ #define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ #define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ #define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ #define AFS_OPERATION_UNINTR 0x0040 /* Set if op is uninterruptible */ #define AFS_OPERATION_DOWNGRADE 0x0080 /* Set to retry with downgraded opcode */ #define AFS_OPERATION_LOCK_0 0x0100 /* Set if have io_lock on file[0] */ #define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ #define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ #define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */ }; /* * Cache auxiliary data. */ struct afs_vnode_cache_aux { __be64 data_version; } __packed; static inline void afs_set_cache_aux(struct afs_vnode *vnode, struct afs_vnode_cache_aux *aux) { aux->data_version = cpu_to_be64(vnode->status.data_version); } static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags) { struct afs_vnode_cache_aux aux; afs_set_cache_aux(vnode, &aux); fscache_invalidate(afs_vnode_cache(vnode), &aux, i_size_read(&vnode->netfs.inode), flags); } /* * Directory iteration management. */ struct afs_dir_iter { struct afs_vnode *dvnode; union afs_xdr_dir_block *block; struct folio_queue *fq; unsigned int fpos; int fq_slot; unsigned int loop_check; u8 nr_slots; u8 bucket; unsigned int prev_entry; }; #include <trace/events/afs.h> /*****************************************************************************/ /* * addr_list.c */ struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr); extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, const char *, size_t, char, unsigned short, unsigned short); bool afs_addr_list_same(const struct afs_addr_list *a, const struct afs_addr_list *b); extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, __be32 xdr, u16 port); extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, __be32 *xdr, u16 port); /* * addr_prefs.c */ int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size); void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist); void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist); /* * callback.c */ extern void afs_invalidate_mmap_work(struct work_struct *); extern void afs_init_callback_state(struct afs_server *); extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break *); static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub; } static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_vnode *vnode) { return cb_break != (vnode->cb_break + atomic_read(&vnode->volume->cb_ro_snapshot) + atomic_read(&vnode->volume->cb_scrub)); } /* * cell.c */ extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, const char *, bool); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_manage_cells(struct work_struct *); extern void afs_cells_timer(struct timer_list *); extern void __net_exit afs_cell_purge(struct afs_net *); /* * cmservice.c */ extern bool afs_cm_incoming_call(struct afs_call *); /* * dir.c */ extern const struct file_operations afs_dir_file_operations; extern const struct inode_operations afs_dir_inode_operations; extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) __acquires(&dvnode->validate_lock); extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); int afs_single_writepages(struct address_space *mapping, struct writeback_control *wbc); /* * dir_edit.c */ extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* * dir_search.c */ unsigned int afs_dir_hash_name(const struct qstr *name); bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, struct afs_fid *_fid); int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, struct afs_fid *_fid, afs_dataversion_t *_dir_version); /* * dir_silly.c */ extern int afs_sillyrename(struct afs_vnode *, struct afs_vnode *, struct dentry *, struct key *); extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); extern int afs_dynroot_populate(struct super_block *); extern void afs_dynroot_depopulate(struct super_block *); /* * file.c */ extern const struct address_space_operations afs_file_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; extern const struct afs_operation_ops afs_fetch_data_operation; extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); void afs_fetch_data_async_rx(struct work_struct *work); void afs_fetch_data_immediate_cancel(struct afs_call *call); /* * flock.c */ extern struct workqueue_struct *afs_lock_manager; extern void afs_lock_op_done(struct afs_call *); extern void afs_lock_work(struct work_struct *); extern void afs_lock_may_be_available(struct afs_vnode *); extern int afs_lock(struct file *, int, struct file_lock *); extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ extern void afs_fs_fetch_status(struct afs_operation *); extern void afs_fs_fetch_data(struct afs_operation *); extern void afs_fs_create_file(struct afs_operation *); extern void afs_fs_make_dir(struct afs_operation *); extern void afs_fs_remove_file(struct afs_operation *); extern void afs_fs_remove_dir(struct afs_operation *); extern void afs_fs_link(struct afs_operation *); extern void afs_fs_symlink(struct afs_operation *); extern void afs_fs_rename(struct afs_operation *); extern void afs_fs_store_data(struct afs_operation *); extern void afs_fs_setattr(struct afs_operation *); extern void afs_fs_get_volume_status(struct afs_operation *); extern void afs_fs_set_lock(struct afs_operation *); extern void afs_fs_extend_lock(struct afs_operation *); extern void afs_fs_release_lock(struct afs_operation *); int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, struct afs_address *addr, struct key *key); bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, struct afs_endpoint_state *estate, unsigned int addr_index, struct key *key); extern void afs_fs_inline_bulk_status(struct afs_operation *); struct afs_acl { u32 size; u8 data[] __counted_by(size); }; extern void afs_fs_fetch_acl(struct afs_operation *); extern void afs_fs_store_acl(struct afs_operation *); /* * fs_operation.c */ extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); extern int afs_put_operation(struct afs_operation *); extern bool afs_begin_vnode_operation(struct afs_operation *); extern void afs_end_vnode_operation(struct afs_operation *op); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, struct afs_vnode *vnode) { op->file[n].vnode = vnode; op->file[n].need_io_lock = true; } static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, const struct afs_fid *fid) { op->file[n].fid = *fid; } /* * fs_probe.c */ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, struct afs_addr_list *new_addrs, struct key *key); int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, unsigned long exclude, bool is_intr); extern void afs_fs_probe_cleanup(struct afs_net *); /* * inode.c */ extern const struct afs_operation_ops afs_fetch_status_operation; void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); const char *afs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback); int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); /* * main.c */ extern struct workqueue_struct *afs_wq; extern int afs_net_id; static inline struct afs_net *afs_net(struct net *net) { return net_generic(net, afs_net_id); } static inline struct afs_net *afs_sb2net(struct super_block *sb) { return afs_net(AFS_FS_S(sb)->net_ns); } static inline struct afs_net *afs_d2net(struct dentry *dentry) { return afs_sb2net(dentry->d_sb); } static inline struct afs_net *afs_i2net(struct inode *inode) { return afs_sb2net(inode->i_sb); } static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { return afs_i2net(&vnode->netfs.inode); } static inline struct afs_net *afs_sock2net(struct sock *sk) { return net_generic(sock_net(sk), afs_net_id); } static inline void __afs_stat(atomic_t *s) { atomic_inc(s); } #define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n) /* * misc.c */ extern int afs_abort_to_error(u32); extern void afs_prioritise_error(struct afs_error *, int, u32); static inline void afs_op_nomem(struct afs_operation *op) { op->cumul_error.error = -ENOMEM; } static inline int afs_op_error(const struct afs_operation *op) { return op->cumul_error.error; } static inline s32 afs_op_abort_code(const struct afs_operation *op) { return op->cumul_error.abort_code; } static inline int afs_op_set_error(struct afs_operation *op, int error) { return op->cumul_error.error = error; } static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) { afs_prioritise_error(&op->cumul_error, error, abort_code); } /* * mntpt.c */ extern const struct inode_operations afs_mntpt_inode_operations; extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; extern struct vfsmount *afs_d_automount(struct path *); extern void afs_mntpt_kill_timer(void); /* * proc.c */ #ifdef CONFIG_PROC_FS extern int __net_init afs_proc_init(struct afs_net *); extern void __net_exit afs_proc_cleanup(struct afs_net *); extern int afs_proc_cell_setup(struct afs_cell *); extern void afs_proc_cell_remove(struct afs_cell *); extern void afs_put_sysnames(struct afs_sysnames *); #else static inline int afs_proc_init(struct afs_net *net) { return 0; } static inline void afs_proc_cleanup(struct afs_net *net) {} static inline int afs_proc_cell_setup(struct afs_cell *cell) { return 0; } static inline void afs_proc_cell_remove(struct afs_cell *cell) {} static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} #endif /* * rotate.c */ void afs_clear_server_states(struct afs_operation *op); extern bool afs_select_fileserver(struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *); /* * rxrpc.c */ extern struct workqueue_struct *afs_async_calls; extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); void afs_deferred_put_call(struct afs_call *call); void afs_make_call(struct afs_call *call, gfp_t gfp); void afs_deliver_to_call(struct afs_call *call); void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); static inline struct afs_call *afs_get_call(struct afs_call *call, enum afs_call_trace why) { int r; __refcount_inc(&call->ref, &r); trace_afs_call(call->debug_id, why, r + 1, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); return call; } static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) { int r = refcount_read(&call->ref); trace_afs_call(call->debug_id, why, r, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); } static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { struct afs_addr_list *alist = op->estate->addresses; op->call = call; op->type = call->type; call->op = op; call->key = op->key; call->intr = !(op->flags & AFS_OPERATION_UNINTR); call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); call->service_id = op->server->service_id; afs_make_call(call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) { call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) { call->iov_len = sizeof(call->tmp); afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); } static inline void afs_extract_to_tmp64(struct afs_call *call) { call->iov_len = sizeof(call->tmp64); afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); } static inline void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) { call->iov_len = size; afs_extract_begin(call, call->buffer, size); } static inline int afs_transfer_reply(struct afs_call *call) { return afs_extract_data(call, false); } static inline bool afs_check_call_state(struct afs_call *call, enum afs_call_state state) { return READ_ONCE(call->state) == state; } static inline bool afs_set_call_state(struct afs_call *call, enum afs_call_state from, enum afs_call_state to) { bool ok = false; spin_lock_bh(&call->state_lock); if (call->state == from) { call->state = to; trace_afs_call_state(call, from, to, 0, 0); ok = true; } spin_unlock_bh(&call->state_lock); return ok; } static inline void afs_set_call_complete(struct afs_call *call, int error, u32 remote_abort) { enum afs_call_state state; bool ok = false; spin_lock_bh(&call->state_lock); state = call->state; if (state != AFS_CALL_COMPLETE) { call->abort_code = remote_abort; call->error = error; call->state = AFS_CALL_COMPLETE; trace_afs_call_state(call, state, AFS_CALL_COMPLETE, error, remote_abort); ok = true; } spin_unlock_bh(&call->state_lock); if (ok) { trace_afs_call_done(call); /* Asynchronous calls have two refs to release - one from the alloc and * one queued with the work item - and we can't just deallocate the * call because the work item may be queued again. */ if (call->drop_ref) afs_put_call(call); } } /* * security.c */ extern void afs_put_permits(struct afs_permits *); extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, struct afs_status_cb *); extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* * server.c */ extern spinlock_t afs_server_peer_lock; extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void afs_fs_probe_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); static inline void afs_inc_servers_outstanding(struct afs_net *net) { atomic_inc(&net->servers_outstanding); } static inline void afs_dec_servers_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->servers_outstanding)) wake_up_var(&net->servers_outstanding); } static inline bool afs_is_probing_server(struct afs_server *server) { return list_empty(&server->probe_link); } /* * server_list.c */ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist) { refcount_inc(&slist->usage); return slist; } extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, struct key *key, struct afs_vldb_entry *vldb); extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, struct afs_server_list *old); void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c */ extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* * validation.c */ bool afs_check_validity(const struct afs_vnode *vnode); int afs_update_volume_state(struct afs_operation *op); int afs_validate(struct afs_vnode *vnode, struct key *key); /* * vlclient.c */ extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, const char *, int); extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); struct afs_call *afs_vl_get_capabilities(struct afs_net *net, struct afs_addr_list *alist, unsigned int addr_index, struct key *key, struct afs_vlserver *server, unsigned int server_index); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); /* * vl_alias.c */ extern int afs_cell_detect_alias(struct afs_cell *, struct key *); /* * vl_probe.c */ extern void afs_vlserver_probe_result(struct afs_call *); extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *); extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long); /* * vl_rotate.c */ extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *, struct afs_cell *, struct key *); extern bool afs_select_vlserver(struct afs_vl_cursor *); extern bool afs_select_current_vlserver(struct afs_vl_cursor *); extern int afs_end_vlserver_operation(struct afs_vl_cursor *); /* * vlserver_list.c */ static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver) { refcount_inc(&vlserver->ref); return vlserver; } static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist) { if (vllist) refcount_inc(&vllist->ref); return vllist; } extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short); extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *); extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int); extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *); extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, const void *, size_t); /* * volume.c */ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ void afs_prepare_write(struct netfs_io_subrequest *subreq); void afs_issue_write(struct netfs_io_subrequest *subreq); void afs_begin_writeback(struct netfs_io_request *wreq); void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream); extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); /* * xattr.c */ extern const struct xattr_handler * const afs_xattr_handlers[]; /* * yfsclient.c */ extern void yfs_fs_fetch_data(struct afs_operation *); extern void yfs_fs_create_file(struct afs_operation *); extern void yfs_fs_make_dir(struct afs_operation *); extern void yfs_fs_remove_file2(struct afs_operation *); extern void yfs_fs_remove_file(struct afs_operation *); extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); extern void yfs_fs_set_lock(struct afs_operation *); extern void yfs_fs_extend_lock(struct afs_operation *); extern void yfs_fs_release_lock(struct afs_operation *); extern void yfs_fs_fetch_status(struct afs_operation *); extern void yfs_fs_inline_bulk_status(struct afs_operation *); struct yfs_acl { struct afs_acl *acl; /* Dir/file/symlink ACL */ struct afs_acl *vol_acl; /* Whole volume ACL */ u32 inherit_flag; /* True if ACL is inherited from parent dir */ u32 num_cleaned; /* Number of ACEs removed due to subject removal */ unsigned int flags; #define YFS_ACL_WANT_ACL 0x01 /* Set if caller wants ->acl */ #define YFS_ACL_WANT_VOL_ACL 0x02 /* Set if caller wants ->vol_acl */ }; extern void yfs_free_opaque_acl(struct yfs_acl *); extern void yfs_fs_fetch_opaque_acl(struct afs_operation *); extern void yfs_fs_store_opaque_acl2(struct afs_operation *); /* * Miscellaneous inline functions. */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { return &vnode->netfs.inode; } /* * Note that a dentry got changed. We need to set d_fsdata to the data version * number derived from the result of the operation. It doesn't matter if * d_fsdata goes backwards as we'll just revalidate. */ static inline void afs_update_dentry_version(struct afs_operation *op, struct afs_vnode_param *dir_vp, struct dentry *dentry) { if (!op->cumul_error.error) dentry->d_fsdata = (void *)(unsigned long)dir_vp->scb.status.data_version; } /* * Set the file size and block count. Estimate the number of 512 bytes blocks * used, rounded up to nearest 1K for consistency with other AFS clients. */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { i_size_write(&vnode->netfs.inode, size); vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). */ static inline void afs_check_dir_conflict(struct afs_operation *op, struct afs_vnode_param *dvp) { if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version) op->flags |= AFS_OPERATION_DIR_CONFLICT; } static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) { trace_afs_io_error(call->debug_id, -EIO, where); return -EIO; } static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) { trace_afs_file_error(vnode, -EIO, where); return -EIO; } /* * Set the callback promise on a vnode. */ static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at, enum afs_cb_promise_trace trace) { atomic64_set(&vnode->cb_expires_at, expires_at); trace_afs_cb_promise(vnode, trace); } /* * Clear the callback promise on a vnode, returning true if it was promised. */ static inline bool afs_clear_cb_promise(struct afs_vnode *vnode, enum afs_cb_promise_trace trace) { trace_afs_cb_promise(vnode, trace); return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE; } /* * Mark a directory as being invalid. */ static inline void afs_invalidate_dir(struct afs_vnode *dvnode, enum afs_dir_invalid_trace trace) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { trace_afs_dir_invalid(dvnode, trace); afs_stat_v(dvnode, n_inval); } } /*****************************************************************************/ /* * debug tracing */ extern unsigned afs_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) #elif defined(CONFIG_AFS_DEBUG) #define AFS_DEBUG_KENTER 0x01 #define AFS_DEBUG_KLEAVE 0x02 #define AFS_DEBUG_KDEBUG 0x04 #define _enter(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \ kenter(FMT,##__VA_ARGS__); \ } while (0) #define _leave(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \ kleave(FMT,##__VA_ARGS__); \ } while (0) #define _debug(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \ kdebug(FMT,##__VA_ARGS__); \ } while (0) #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) #endif /* * debug assertion checking */ #if 1 // defined(__KDEBUGALL) #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ BUG(); \ } \ } while(0) #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #else #define ASSERT(X) \ do { \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ } while(0) #define ASSERTIF(C, X) \ do { \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ } while(0) #endif /* __KDEBUGALL */ |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_INCLUDE_PATH ../../drivers/dma-buf #define TRACE_SYSTEM sync_trace #if !defined(_TRACE_SYNC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SYNC_H #include "sync_debug.h" #include <linux/tracepoint.h> TRACE_EVENT(sync_timeline, TP_PROTO(struct sync_timeline *timeline), TP_ARGS(timeline), TP_STRUCT__entry( __string(name, timeline->name) __field(u32, value) ), TP_fast_assign( __assign_str(name); __entry->value = timeline->value; ), TP_printk("name=%s value=%d", __get_str(name), __entry->value) ); #endif /* if !defined(_TRACE_SYNC_H) || defined(TRACE_HEADER_MULTI_READ) */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * Copyright (c) Nokia Corporation, 2006, 2007 * * Author: Artem Bityutskiy (Битюцкий Артём) */ /* * This file includes volume table manipulation code. The volume table is an * on-flash table containing volume meta-data like name, number of reserved * physical eraseblocks, type, etc. The volume table is stored in the so-called * "layout volume". * * The layout volume is an internal volume which is organized as follows. It * consists of two logical eraseblocks - LEB 0 and LEB 1. Each logical * eraseblock stores one volume table copy, i.e. LEB 0 and LEB 1 duplicate each * other. This redundancy guarantees robustness to unclean reboots. The volume * table is basically an array of volume table records. Each record contains * full information about the volume and protected by a CRC checksum. Note, * nowadays we use the atomic LEB change operation when updating the volume * table, so we do not really need 2 LEBs anymore, but we preserve the older * design for the backward compatibility reasons. * * When the volume table is changed, it is first changed in RAM. Then LEB 0 is * erased, and the updated volume table is written back to LEB 0. Then same for * LEB 1. This scheme guarantees recoverability from unclean reboots. * * In this UBI implementation the on-flash volume table does not contain any * information about how much data static volumes contain. * * But it would still be beneficial to store this information in the volume * table. For example, suppose we have a static volume X, and all its physical * eraseblocks became bad for some reasons. Suppose we are attaching the * corresponding MTD device, for some reason we find no logical eraseblocks * corresponding to the volume X. According to the volume table volume X does * exist. So we don't know whether it is just empty or all its physical * eraseblocks went bad. So we cannot alarm the user properly. * * The volume table also stores so-called "update marker", which is used for * volume updates. Before updating the volume, the update marker is set, and * after the update operation is finished, the update marker is cleared. So if * the update operation was interrupted (e.g. by an unclean reboot) - the * update marker is still there and we know that the volume's contents is * damaged. */ #include <linux/crc32.h> #include <linux/err.h> #include <linux/slab.h> #include <asm/div64.h> #include "ubi.h" static void self_vtbl_check(const struct ubi_device *ubi); /* Empty volume table record */ static struct ubi_vtbl_record empty_vtbl_record; /** * ubi_update_layout_vol - helper for updatting layout volumes on flash * @ubi: UBI device description object */ static int ubi_update_layout_vol(struct ubi_device *ubi) { struct ubi_volume *layout_vol; int i, err; layout_vol = ubi->volumes[vol_id2idx(ubi, UBI_LAYOUT_VOLUME_ID)]; for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) { err = ubi_eba_atomic_leb_change(ubi, layout_vol, i, ubi->vtbl, ubi->vtbl_size); if (err) return err; } return 0; } /** * ubi_change_vtbl_record - change volume table record. * @ubi: UBI device description object * @idx: table index to change * @vtbl_rec: new volume table record * * This function changes volume table record @idx. If @vtbl_rec is %NULL, empty * volume table record is written. The caller does not have to calculate CRC of * the record as it is done by this function. Returns zero in case of success * and a negative error code in case of failure. */ int ubi_change_vtbl_record(struct ubi_device *ubi, int idx, struct ubi_vtbl_record *vtbl_rec) { int err; uint32_t crc; ubi_assert(idx >= 0 && idx < ubi->vtbl_slots); if (!vtbl_rec) vtbl_rec = &empty_vtbl_record; else { crc = crc32(UBI_CRC32_INIT, vtbl_rec, UBI_VTBL_RECORD_SIZE_CRC); vtbl_rec->crc = cpu_to_be32(crc); } memcpy(&ubi->vtbl[idx], vtbl_rec, sizeof(struct ubi_vtbl_record)); err = ubi_update_layout_vol(ubi); self_vtbl_check(ubi); return err ? err : 0; } /** * ubi_vtbl_rename_volumes - rename UBI volumes in the volume table. * @ubi: UBI device description object * @rename_list: list of &struct ubi_rename_entry objects * * This function re-names multiple volumes specified in @req in the volume * table. Returns zero in case of success and a negative error code in case of * failure. */ int ubi_vtbl_rename_volumes(struct ubi_device *ubi, struct list_head *rename_list) { struct ubi_rename_entry *re; list_for_each_entry(re, rename_list, list) { uint32_t crc; struct ubi_volume *vol = re->desc->vol; struct ubi_vtbl_record *vtbl_rec = &ubi->vtbl[vol->vol_id]; if (re->remove) { memcpy(vtbl_rec, &empty_vtbl_record, sizeof(struct ubi_vtbl_record)); continue; } vtbl_rec->name_len = cpu_to_be16(re->new_name_len); memcpy(vtbl_rec->name, re->new_name, re->new_name_len); memset(vtbl_rec->name + re->new_name_len, 0, UBI_VOL_NAME_MAX + 1 - re->new_name_len); crc = crc32(UBI_CRC32_INIT, vtbl_rec, UBI_VTBL_RECORD_SIZE_CRC); vtbl_rec->crc = cpu_to_be32(crc); } return ubi_update_layout_vol(ubi); } /** * vtbl_check - check if volume table is not corrupted and sensible. * @ubi: UBI device description object * @vtbl: volume table * * This function returns zero if @vtbl is all right, %1 if CRC is incorrect, * and %-EINVAL if it contains inconsistent data. */ static int vtbl_check(const struct ubi_device *ubi, const struct ubi_vtbl_record *vtbl) { int i, n, reserved_pebs, alignment, data_pad, vol_type, name_len; int upd_marker, err; uint32_t crc; const char *name; for (i = 0; i < ubi->vtbl_slots; i++) { cond_resched(); reserved_pebs = be32_to_cpu(vtbl[i].reserved_pebs); alignment = be32_to_cpu(vtbl[i].alignment); data_pad = be32_to_cpu(vtbl[i].data_pad); upd_marker = vtbl[i].upd_marker; vol_type = vtbl[i].vol_type; name_len = be16_to_cpu(vtbl[i].name_len); name = &vtbl[i].name[0]; crc = crc32(UBI_CRC32_INIT, &vtbl[i], UBI_VTBL_RECORD_SIZE_CRC); if (be32_to_cpu(vtbl[i].crc) != crc) { ubi_err(ubi, "bad CRC at record %u: %#08x, not %#08x", i, crc, be32_to_cpu(vtbl[i].crc)); ubi_dump_vtbl_record(&vtbl[i], i); return 1; } if (reserved_pebs == 0) { if (memcmp(&vtbl[i], &empty_vtbl_record, UBI_VTBL_RECORD_SIZE)) { err = 2; goto bad; } continue; } if (reserved_pebs < 0 || alignment < 0 || data_pad < 0 || name_len < 0) { err = 3; goto bad; } if (alignment > ubi->leb_size || alignment == 0) { err = 4; goto bad; } n = alignment & (ubi->min_io_size - 1); if (alignment != 1 && n) { err = 5; goto bad; } n = ubi->leb_size % alignment; if (data_pad != n) { ubi_err(ubi, "bad data_pad, has to be %d", n); err = 6; goto bad; } if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) { err = 7; goto bad; } if (upd_marker != 0 && upd_marker != 1) { err = 8; goto bad; } if (reserved_pebs > ubi->good_peb_count) { ubi_err(ubi, "too large reserved_pebs %d, good PEBs %d", reserved_pebs, ubi->good_peb_count); err = 9; goto bad; } if (name_len > UBI_VOL_NAME_MAX) { err = 10; goto bad; } if (name[0] == '\0') { err = 11; goto bad; } if (name_len != strnlen(name, name_len + 1)) { err = 12; goto bad; } } /* Checks that all names are unique */ for (i = 0; i < ubi->vtbl_slots - 1; i++) { for (n = i + 1; n < ubi->vtbl_slots; n++) { int len1 = be16_to_cpu(vtbl[i].name_len); int len2 = be16_to_cpu(vtbl[n].name_len); if (len1 > 0 && len1 == len2 && !strncmp(vtbl[i].name, vtbl[n].name, len1)) { ubi_err(ubi, "volumes %d and %d have the same name \"%s\"", i, n, vtbl[i].name); ubi_dump_vtbl_record(&vtbl[i], i); ubi_dump_vtbl_record(&vtbl[n], n); return -EINVAL; } } } return 0; bad: ubi_err(ubi, "volume table check failed: record %d, error %d", i, err); ubi_dump_vtbl_record(&vtbl[i], i); return -EINVAL; } /** * create_vtbl - create a copy of volume table. * @ubi: UBI device description object * @ai: attaching information * @copy: number of the volume table copy * @vtbl: contents of the volume table * * This function returns zero in case of success and a negative error code in * case of failure. */ static int create_vtbl(struct ubi_device *ubi, struct ubi_attach_info *ai, int copy, void *vtbl) { int err, tries = 0; struct ubi_vid_io_buf *vidb; struct ubi_vid_hdr *vid_hdr; struct ubi_ainf_peb *new_aeb; dbg_gen("create volume table (copy #%d)", copy + 1); vidb = ubi_alloc_vid_buf(ubi, GFP_KERNEL); if (!vidb) return -ENOMEM; vid_hdr = ubi_get_vid_hdr(vidb); retry: new_aeb = ubi_early_get_peb(ubi, ai); if (IS_ERR(new_aeb)) { err = PTR_ERR(new_aeb); goto out_free; } vid_hdr->vol_type = UBI_LAYOUT_VOLUME_TYPE; vid_hdr->vol_id = cpu_to_be32(UBI_LAYOUT_VOLUME_ID); vid_hdr->compat = UBI_LAYOUT_VOLUME_COMPAT; vid_hdr->data_size = vid_hdr->used_ebs = vid_hdr->data_pad = cpu_to_be32(0); vid_hdr->lnum = cpu_to_be32(copy); vid_hdr->sqnum = cpu_to_be64(++ai->max_sqnum); /* The EC header is already there, write the VID header */ err = ubi_io_write_vid_hdr(ubi, new_aeb->pnum, vidb); if (err) goto write_error; /* Write the layout volume contents */ err = ubi_io_write_data(ubi, vtbl, new_aeb->pnum, 0, ubi->vtbl_size); if (err) goto write_error; /* * And add it to the attaching information. Don't delete the old version * of this LEB as it will be deleted and freed in 'ubi_add_to_av()'. */ err = ubi_add_to_av(ubi, ai, new_aeb->pnum, new_aeb->ec, vid_hdr, 0); ubi_free_aeb(ai, new_aeb); ubi_free_vid_buf(vidb); return err; write_error: if (err == -EIO && ++tries <= 5) { /* * Probably this physical eraseblock went bad, try to pick * another one. */ list_add(&new_aeb->u.list, &ai->erase); goto retry; } ubi_free_aeb(ai, new_aeb); out_free: ubi_free_vid_buf(vidb); return err; } /** * process_lvol - process the layout volume. * @ubi: UBI device description object * @ai: attaching information * @av: layout volume attaching information * * This function is responsible for reading the layout volume, ensuring it is * not corrupted, and recovering from corruptions if needed. Returns volume * table in case of success and a negative error code in case of failure. */ static struct ubi_vtbl_record *process_lvol(struct ubi_device *ubi, struct ubi_attach_info *ai, struct ubi_ainf_volume *av) { int err; struct rb_node *rb; struct ubi_ainf_peb *aeb; struct ubi_vtbl_record *leb[UBI_LAYOUT_VOLUME_EBS] = { NULL, NULL }; int leb_corrupted[UBI_LAYOUT_VOLUME_EBS] = {1, 1}; /* * UBI goes through the following steps when it changes the layout * volume: * a. erase LEB 0; * b. write new data to LEB 0; * c. erase LEB 1; * d. write new data to LEB 1. * * Before the change, both LEBs contain the same data. * * Due to unclean reboots, the contents of LEB 0 may be lost, but there * should LEB 1. So it is OK if LEB 0 is corrupted while LEB 1 is not. * Similarly, LEB 1 may be lost, but there should be LEB 0. And * finally, unclean reboots may result in a situation when neither LEB * 0 nor LEB 1 are corrupted, but they are different. In this case, LEB * 0 contains more recent information. * * So the plan is to first check LEB 0. Then * a. if LEB 0 is OK, it must be containing the most recent data; then * we compare it with LEB 1, and if they are different, we copy LEB * 0 to LEB 1; * b. if LEB 0 is corrupted, but LEB 1 has to be OK, and we copy LEB 1 * to LEB 0. */ dbg_gen("check layout volume"); /* Read both LEB 0 and LEB 1 into memory */ ubi_rb_for_each_entry(rb, aeb, &av->root, u.rb) { leb[aeb->lnum] = vzalloc(ubi->vtbl_size); if (!leb[aeb->lnum]) { err = -ENOMEM; goto out_free; } err = ubi_io_read_data(ubi, leb[aeb->lnum], aeb->pnum, 0, ubi->vtbl_size); if (err == UBI_IO_BITFLIPS || mtd_is_eccerr(err)) /* * Scrub the PEB later. Note, -EBADMSG indicates an * uncorrectable ECC error, but we have our own CRC and * the data will be checked later. If the data is OK, * the PEB will be scrubbed (because we set * aeb->scrub). If the data is not OK, the contents of * the PEB will be recovered from the second copy, and * aeb->scrub will be cleared in * 'ubi_add_to_av()'. */ aeb->scrub = 1; else if (err) goto out_free; } err = -EINVAL; if (leb[0]) { leb_corrupted[0] = vtbl_check(ubi, leb[0]); if (leb_corrupted[0] < 0) goto out_free; } if (!leb_corrupted[0]) { /* LEB 0 is OK */ if (leb[1]) leb_corrupted[1] = memcmp(leb[0], leb[1], ubi->vtbl_size); if (leb_corrupted[1]) { ubi_warn(ubi, "volume table copy #2 is corrupted"); err = create_vtbl(ubi, ai, 1, leb[0]); if (err) goto out_free; ubi_msg(ubi, "volume table was restored"); } /* Both LEB 1 and LEB 2 are OK and consistent */ vfree(leb[1]); return leb[0]; } else { /* LEB 0 is corrupted or does not exist */ if (leb[1]) { leb_corrupted[1] = vtbl_check(ubi, leb[1]); if (leb_corrupted[1] < 0) goto out_free; } if (leb_corrupted[1]) { /* Both LEB 0 and LEB 1 are corrupted */ ubi_err(ubi, "both volume tables are corrupted"); goto out_free; } ubi_warn(ubi, "volume table copy #1 is corrupted"); err = create_vtbl(ubi, ai, 0, leb[1]); if (err) goto out_free; ubi_msg(ubi, "volume table was restored"); vfree(leb[0]); return leb[1]; } out_free: vfree(leb[0]); vfree(leb[1]); return ERR_PTR(err); } /** * create_empty_lvol - create empty layout volume. * @ubi: UBI device description object * @ai: attaching information * * This function returns volume table contents in case of success and a * negative error code in case of failure. */ static struct ubi_vtbl_record *create_empty_lvol(struct ubi_device *ubi, struct ubi_attach_info *ai) { int i; struct ubi_vtbl_record *vtbl; vtbl = vzalloc(ubi->vtbl_size); if (!vtbl) return ERR_PTR(-ENOMEM); for (i = 0; i < ubi->vtbl_slots; i++) memcpy(&vtbl[i], &empty_vtbl_record, UBI_VTBL_RECORD_SIZE); for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) { int err; err = create_vtbl(ubi, ai, i, vtbl); if (err) { vfree(vtbl); return ERR_PTR(err); } } return vtbl; } /** * init_volumes - initialize volume information for existing volumes. * @ubi: UBI device description object * @ai: scanning information * @vtbl: volume table * * This function allocates volume description objects for existing volumes. * Returns zero in case of success and a negative error code in case of * failure. */ static int init_volumes(struct ubi_device *ubi, const struct ubi_attach_info *ai, const struct ubi_vtbl_record *vtbl) { int i, err, reserved_pebs = 0; struct ubi_ainf_volume *av; struct ubi_volume *vol; for (i = 0; i < ubi->vtbl_slots; i++) { cond_resched(); if (be32_to_cpu(vtbl[i].reserved_pebs) == 0) continue; /* Empty record */ vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); if (!vol) return -ENOMEM; vol->reserved_pebs = be32_to_cpu(vtbl[i].reserved_pebs); vol->alignment = be32_to_cpu(vtbl[i].alignment); vol->data_pad = be32_to_cpu(vtbl[i].data_pad); vol->upd_marker = vtbl[i].upd_marker; vol->vol_type = vtbl[i].vol_type == UBI_VID_DYNAMIC ? UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME; vol->name_len = be16_to_cpu(vtbl[i].name_len); vol->usable_leb_size = ubi->leb_size - vol->data_pad; memcpy(vol->name, vtbl[i].name, vol->name_len); vol->name[vol->name_len] = '\0'; vol->vol_id = i; if (vtbl[i].flags & UBI_VTBL_SKIP_CRC_CHECK_FLG) vol->skip_check = 1; if (vtbl[i].flags & UBI_VTBL_AUTORESIZE_FLG) { /* Auto re-size flag may be set only for one volume */ if (ubi->autoresize_vol_id != -1) { ubi_err(ubi, "more than one auto-resize volume (%d and %d)", ubi->autoresize_vol_id, i); kfree(vol); return -EINVAL; } ubi->autoresize_vol_id = i; } ubi_assert(!ubi->volumes[i]); ubi->volumes[i] = vol; ubi->vol_count += 1; vol->ubi = ubi; reserved_pebs += vol->reserved_pebs; /* * We use ubi->peb_count and not vol->reserved_pebs because * we want to keep the code simple. Otherwise we'd have to * resize/check the bitmap upon volume resize too. * Allocating a few bytes more does not hurt. */ err = ubi_fastmap_init_checkmap(vol, ubi->peb_count); if (err) return err; /* * In case of dynamic volume UBI knows nothing about how many * data is stored there. So assume the whole volume is used. */ if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = vol->reserved_pebs; vol->last_eb_bytes = vol->usable_leb_size; vol->used_bytes = (long long)vol->used_ebs * vol->usable_leb_size; continue; } /* Static volumes only */ av = ubi_find_av(ai, i); if (!av || !av->leb_count) { /* * No eraseblocks belonging to this volume found. We * don't actually know whether this static volume is * completely corrupted or just contains no data. And * we cannot know this as long as data size is not * stored on flash. So we just assume the volume is * empty. FIXME: this should be handled. */ continue; } if (av->leb_count != av->used_ebs) { /* * We found a static volume which misses several * eraseblocks. Treat it as corrupted. */ ubi_warn(ubi, "static volume %d misses %d LEBs - corrupted", av->vol_id, av->used_ebs - av->leb_count); vol->corrupted = 1; continue; } vol->used_ebs = av->used_ebs; vol->used_bytes = (long long)(vol->used_ebs - 1) * vol->usable_leb_size; vol->used_bytes += av->last_data_size; vol->last_eb_bytes = av->last_data_size; } /* And add the layout volume */ vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); if (!vol) return -ENOMEM; vol->reserved_pebs = UBI_LAYOUT_VOLUME_EBS; vol->alignment = UBI_LAYOUT_VOLUME_ALIGN; vol->vol_type = UBI_DYNAMIC_VOLUME; vol->name_len = sizeof(UBI_LAYOUT_VOLUME_NAME) - 1; memcpy(vol->name, UBI_LAYOUT_VOLUME_NAME, vol->name_len + 1); vol->usable_leb_size = ubi->leb_size; vol->used_ebs = vol->reserved_pebs; vol->last_eb_bytes = vol->reserved_pebs; vol->used_bytes = (long long)vol->used_ebs * (ubi->leb_size - vol->data_pad); vol->vol_id = UBI_LAYOUT_VOLUME_ID; vol->ref_count = 1; ubi_assert(!ubi->volumes[i]); ubi->volumes[vol_id2idx(ubi, vol->vol_id)] = vol; reserved_pebs += vol->reserved_pebs; ubi->vol_count += 1; vol->ubi = ubi; err = ubi_fastmap_init_checkmap(vol, UBI_LAYOUT_VOLUME_EBS); if (err) return err; if (reserved_pebs > ubi->avail_pebs) { ubi_err(ubi, "not enough PEBs, required %d, available %d", reserved_pebs, ubi->avail_pebs); if (ubi->corr_peb_count) ubi_err(ubi, "%d PEBs are corrupted and not used", ubi->corr_peb_count); return -ENOSPC; } ubi->rsvd_pebs += reserved_pebs; ubi->avail_pebs -= reserved_pebs; return 0; } /** * check_av - check volume attaching information. * @vol: UBI volume description object * @av: volume attaching information * * This function returns zero if the volume attaching information is consistent * to the data read from the volume tabla, and %-EINVAL if not. */ static int check_av(const struct ubi_volume *vol, const struct ubi_ainf_volume *av) { int err; if (av->highest_lnum >= vol->reserved_pebs) { err = 1; goto bad; } if (av->leb_count > vol->reserved_pebs) { err = 2; goto bad; } if (av->vol_type != vol->vol_type) { err = 3; goto bad; } if (av->used_ebs > vol->reserved_pebs) { err = 4; goto bad; } if (av->data_pad != vol->data_pad) { err = 5; goto bad; } return 0; bad: ubi_err(vol->ubi, "bad attaching information, error %d", err); ubi_dump_av(av); ubi_dump_vol_info(vol); return -EINVAL; } /** * check_attaching_info - check that attaching information. * @ubi: UBI device description object * @ai: attaching information * * Even though we protect on-flash data by CRC checksums, we still don't trust * the media. This function ensures that attaching information is consistent to * the information read from the volume table. Returns zero if the attaching * information is OK and %-EINVAL if it is not. */ static int check_attaching_info(const struct ubi_device *ubi, struct ubi_attach_info *ai) { int err, i; struct ubi_ainf_volume *av; struct ubi_volume *vol; if (ai->vols_found > UBI_INT_VOL_COUNT + ubi->vtbl_slots) { ubi_err(ubi, "found %d volumes while attaching, maximum is %d + %d", ai->vols_found, UBI_INT_VOL_COUNT, ubi->vtbl_slots); return -EINVAL; } if (ai->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT && ai->highest_vol_id < UBI_INTERNAL_VOL_START) { ubi_err(ubi, "too large volume ID %d found", ai->highest_vol_id); return -EINVAL; } for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { cond_resched(); av = ubi_find_av(ai, i); vol = ubi->volumes[i]; if (!vol) { if (av) ubi_remove_av(ai, av); continue; } if (vol->reserved_pebs == 0) { ubi_assert(i < ubi->vtbl_slots); if (!av) continue; /* * During attaching we found a volume which does not * exist according to the information in the volume * table. This must have happened due to an unclean * reboot while the volume was being removed. Discard * these eraseblocks. */ ubi_msg(ubi, "finish volume %d removal", av->vol_id); ubi_remove_av(ai, av); } else if (av) { err = check_av(vol, av); if (err) return err; } } return 0; } /** * ubi_read_volume_table - read the volume table. * @ubi: UBI device description object * @ai: attaching information * * This function reads volume table, checks it, recover from errors if needed, * or creates it if needed. Returns zero in case of success and a negative * error code in case of failure. */ int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_attach_info *ai) { int err; struct ubi_ainf_volume *av; empty_vtbl_record.crc = cpu_to_be32(0xf116c36b); /* * The number of supported volumes is limited by the eraseblock size * and by the UBI_MAX_VOLUMES constant. */ if (ubi->leb_size < UBI_VTBL_RECORD_SIZE) { ubi_err(ubi, "LEB size too small for a volume record"); return -EINVAL; } ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE; if (ubi->vtbl_slots > UBI_MAX_VOLUMES) ubi->vtbl_slots = UBI_MAX_VOLUMES; ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE; ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size); av = ubi_find_av(ai, UBI_LAYOUT_VOLUME_ID); if (!av) { /* * No logical eraseblocks belonging to the layout volume were * found. This could mean that the flash is just empty. In * this case we create empty layout volume. * * But if flash is not empty this must be a corruption or the * MTD device just contains garbage. */ if (ai->is_empty) { ubi->vtbl = create_empty_lvol(ubi, ai); if (IS_ERR(ubi->vtbl)) return PTR_ERR(ubi->vtbl); } else { ubi_err(ubi, "the layout volume was not found"); return -EINVAL; } } else { if (av->leb_count > UBI_LAYOUT_VOLUME_EBS) { /* This must not happen with proper UBI images */ ubi_err(ubi, "too many LEBs (%d) in layout volume", av->leb_count); return -EINVAL; } ubi->vtbl = process_lvol(ubi, ai, av); if (IS_ERR(ubi->vtbl)) return PTR_ERR(ubi->vtbl); } ubi->avail_pebs = ubi->good_peb_count - ubi->corr_peb_count; /* * The layout volume is OK, initialize the corresponding in-RAM data * structures. */ err = init_volumes(ubi, ai, ubi->vtbl); if (err) goto out_free; /* * Make sure that the attaching information is consistent to the * information stored in the volume table. */ err = check_attaching_info(ubi, ai); if (err) goto out_free; return 0; out_free: vfree(ubi->vtbl); ubi_free_all_volumes(ubi); return err; } /** * self_vtbl_check - check volume table. * @ubi: UBI device description object */ static void self_vtbl_check(const struct ubi_device *ubi) { if (!ubi_dbg_chk_gen(ubi)) return; if (vtbl_check(ubi, ubi->vtbl)) { ubi_err(ubi, "self-check failed"); BUG(); } } |
| 3 3 3 3 12 12 7 3 2 12 12 12 12 12 12 12 1 11 2 18 18 14 4 14 2 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012 Bjørn Mork <bjorn@mork.no> * * The probing code is heavily inspired by cdc_ether, which is: * Copyright (C) 2003-2005 by David Brownell * Copyright (C) 2006 by Ole Andre Vadla Ravnas (ActiveSync) */ #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/netdevice.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/kstrtox.h> #include <linux/mii.h> #include <linux/rtnetlink.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #include <linux/usb/cdc-wdm.h> #include <linux/u64_stats_sync.h> /* This driver supports wwan (3G/LTE/?) devices using a vendor * specific management protocol called Qualcomm MSM Interface (QMI) - * in addition to the more common AT commands over serial interface * management * * QMI is wrapped in CDC, using CDC encapsulated commands on the * control ("master") interface of a two-interface CDC Union * resembling standard CDC ECM. The devices do not use the control * interface for any other CDC messages. Most likely because the * management protocol is used in place of the standard CDC * notifications NOTIFY_NETWORK_CONNECTION and NOTIFY_SPEED_CHANGE * * Alternatively, control and data functions can be combined in a * single USB interface. * * Handling a protocol like QMI is out of the scope for any driver. * It is exported as a character device using the cdc-wdm driver as * a subdriver, enabling userspace applications ("modem managers") to * handle it. * * These devices may alternatively/additionally be configured using AT * commands on a serial interface */ /* driver specific data */ struct qmi_wwan_state { struct usb_driver *subdriver; atomic_t pmcount; unsigned long flags; struct usb_interface *control; struct usb_interface *data; }; enum qmi_wwan_flags { QMI_WWAN_FLAG_RAWIP = 1 << 0, QMI_WWAN_FLAG_MUX = 1 << 1, QMI_WWAN_FLAG_PASS_THROUGH = 1 << 2, }; enum qmi_wwan_quirks { QMI_WWAN_QUIRK_DTR = 1 << 0, /* needs "set DTR" request */ }; struct qmimux_hdr { u8 pad; u8 mux_id; __be16 pkt_len; }; struct qmimux_priv { struct net_device *real_dev; u8 mux_id; }; static int qmimux_open(struct net_device *dev) { struct qmimux_priv *priv = netdev_priv(dev); struct net_device *real_dev = priv->real_dev; if (!(priv->real_dev->flags & IFF_UP)) return -ENETDOWN; if (netif_carrier_ok(real_dev)) netif_carrier_on(dev); return 0; } static int qmimux_stop(struct net_device *dev) { netif_carrier_off(dev); return 0; } static netdev_tx_t qmimux_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct qmimux_priv *priv = netdev_priv(dev); unsigned int len = skb->len; struct qmimux_hdr *hdr; netdev_tx_t ret; hdr = skb_push(skb, sizeof(struct qmimux_hdr)); hdr->pad = 0; hdr->mux_id = priv->mux_id; hdr->pkt_len = cpu_to_be16(len); skb->dev = priv->real_dev; ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_sw_netstats_tx_add(dev, 1, len); else dev->stats.tx_dropped++; return ret; } static const struct net_device_ops qmimux_netdev_ops = { .ndo_open = qmimux_open, .ndo_stop = qmimux_stop, .ndo_start_xmit = qmimux_start_xmit, }; static void qmimux_setup(struct net_device *dev) { dev->header_ops = NULL; /* No header */ dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->netdev_ops = &qmimux_netdev_ops; dev->mtu = 1500; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->needs_free_netdev = true; } static struct net_device *qmimux_find_dev(struct usbnet *dev, u8 mux_id) { struct qmimux_priv *priv; struct list_head *iter; struct net_device *ldev; rcu_read_lock(); netdev_for_each_upper_dev_rcu(dev->net, ldev, iter) { priv = netdev_priv(ldev); if (priv->mux_id == mux_id) { rcu_read_unlock(); return ldev; } } rcu_read_unlock(); return NULL; } static bool qmimux_has_slaves(struct usbnet *dev) { return !list_empty(&dev->net->adj_list.upper); } static int qmimux_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { unsigned int len, offset = 0, pad_len, pkt_len; struct qmimux_hdr *hdr; struct net_device *net; struct sk_buff *skbn; u8 qmimux_hdr_sz = sizeof(*hdr); while (offset + qmimux_hdr_sz < skb->len) { hdr = (struct qmimux_hdr *)(skb->data + offset); len = be16_to_cpu(hdr->pkt_len); /* drop the packet, bogus length */ if (offset + len + qmimux_hdr_sz > skb->len) return 0; /* control packet, we do not know what to do */ if (hdr->pad & 0x80) goto skip; /* extract padding length and check for valid length info */ pad_len = hdr->pad & 0x3f; if (len == 0 || pad_len >= len) goto skip; pkt_len = len - pad_len; net = qmimux_find_dev(dev, hdr->mux_id); if (!net) goto skip; skbn = netdev_alloc_skb(net, pkt_len + LL_MAX_HEADER); if (!skbn) return 0; switch (skb->data[offset + qmimux_hdr_sz] & 0xf0) { case 0x40: skbn->protocol = htons(ETH_P_IP); break; case 0x60: skbn->protocol = htons(ETH_P_IPV6); break; default: /* not ip - do not know what to do */ kfree_skb(skbn); goto skip; } skb_reserve(skbn, LL_MAX_HEADER); skb_put_data(skbn, skb->data + offset + qmimux_hdr_sz, pkt_len); if (netif_rx(skbn) != NET_RX_SUCCESS) { net->stats.rx_errors++; return 0; } else { dev_sw_netstats_rx_add(net, pkt_len); } skip: offset += len + qmimux_hdr_sz; } return 1; } static ssize_t mux_id_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_device *dev = to_net_dev(d); struct qmimux_priv *priv; priv = netdev_priv(dev); return sysfs_emit(buf, "0x%02x\n", priv->mux_id); } static DEVICE_ATTR_RO(mux_id); static struct attribute *qmi_wwan_sysfs_qmimux_attrs[] = { &dev_attr_mux_id.attr, NULL, }; static struct attribute_group qmi_wwan_sysfs_qmimux_attr_group = { .name = "qmap", .attrs = qmi_wwan_sysfs_qmimux_attrs, }; static int qmimux_register_device(struct net_device *real_dev, u8 mux_id) { struct net_device *new_dev; struct qmimux_priv *priv; int err; new_dev = alloc_netdev(sizeof(struct qmimux_priv), "qmimux%d", NET_NAME_UNKNOWN, qmimux_setup); if (!new_dev) return -ENOBUFS; dev_net_set(new_dev, dev_net(real_dev)); priv = netdev_priv(new_dev); priv->mux_id = mux_id; priv->real_dev = real_dev; new_dev->sysfs_groups[0] = &qmi_wwan_sysfs_qmimux_attr_group; err = register_netdevice(new_dev); if (err < 0) goto out_free_newdev; /* Account for reference in struct qmimux_priv_priv */ dev_hold(real_dev); err = netdev_upper_dev_link(real_dev, new_dev, NULL); if (err) goto out_unregister_netdev; netif_stacked_transfer_operstate(real_dev, new_dev); return 0; out_unregister_netdev: unregister_netdevice(new_dev); dev_put(real_dev); out_free_newdev: free_netdev(new_dev); return err; } static void qmimux_unregister_device(struct net_device *dev, struct list_head *head) { struct qmimux_priv *priv = netdev_priv(dev); struct net_device *real_dev = priv->real_dev; netdev_upper_dev_unlink(real_dev, dev); unregister_netdevice_queue(dev, head); /* Get rid of the reference to real_dev */ dev_put(real_dev); } static void qmi_wwan_netdev_setup(struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct qmi_wwan_state *info = (void *)&dev->data; if (info->flags & QMI_WWAN_FLAG_RAWIP) { net->header_ops = NULL; /* No header */ net->type = ARPHRD_NONE; net->hard_header_len = 0; net->addr_len = 0; net->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; set_bit(EVENT_NO_IP_ALIGN, &dev->flags); netdev_dbg(net, "mode: raw IP\n"); } else if (!net->header_ops) { /* don't bother if already set */ ether_setup(net); /* Restoring min/max mtu values set originally by usbnet */ net->min_mtu = 0; net->max_mtu = ETH_MAX_MTU; clear_bit(EVENT_NO_IP_ALIGN, &dev->flags); netdev_dbg(net, "mode: Ethernet\n"); } /* recalculate buffers after changing hard_header_len */ usbnet_change_mtu(net, net->mtu); } static ssize_t raw_ip_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; return sprintf(buf, "%c\n", info->flags & QMI_WWAN_FLAG_RAWIP ? 'Y' : 'N'); } static ssize_t raw_ip_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; bool enable; int ret; if (kstrtobool(buf, &enable)) return -EINVAL; /* no change? */ if (enable == (info->flags & QMI_WWAN_FLAG_RAWIP)) return len; /* ip mode cannot be cleared when pass through mode is set */ if (!enable && (info->flags & QMI_WWAN_FLAG_PASS_THROUGH)) { netdev_err(dev->net, "Cannot clear ip mode on pass through device\n"); return -EINVAL; } if (!rtnl_trylock()) return restart_syscall(); /* we don't want to modify a running netdev */ if (netif_running(dev->net)) { netdev_err(dev->net, "Cannot change a running device\n"); ret = -EBUSY; goto err; } /* let other drivers deny the change */ ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev->net); ret = notifier_to_errno(ret); if (ret) { netdev_err(dev->net, "Type change was refused\n"); goto err; } if (enable) info->flags |= QMI_WWAN_FLAG_RAWIP; else info->flags &= ~QMI_WWAN_FLAG_RAWIP; qmi_wwan_netdev_setup(dev->net); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev->net); ret = len; err: rtnl_unlock(); return ret; } static ssize_t add_mux_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_device *dev = to_net_dev(d); struct qmimux_priv *priv; struct list_head *iter; struct net_device *ldev; ssize_t count = 0; rcu_read_lock(); netdev_for_each_upper_dev_rcu(dev, ldev, iter) { priv = netdev_priv(ldev); count += scnprintf(&buf[count], PAGE_SIZE - count, "0x%02x\n", priv->mux_id); } rcu_read_unlock(); return count; } static ssize_t add_mux_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; u8 mux_id; int ret; if (kstrtou8(buf, 0, &mux_id)) return -EINVAL; /* mux_id [1 - 254] for compatibility with ip(8) and the rmnet driver */ if (mux_id < 1 || mux_id > 254) return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); if (qmimux_find_dev(dev, mux_id)) { netdev_err(dev->net, "mux_id already present\n"); ret = -EINVAL; goto err; } ret = qmimux_register_device(dev->net, mux_id); if (!ret) { info->flags |= QMI_WWAN_FLAG_MUX; ret = len; } err: rtnl_unlock(); return ret; } static ssize_t del_mux_show(struct device *d, struct device_attribute *attr, char *buf) { return add_mux_show(d, attr, buf); } static ssize_t del_mux_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info = (void *)&dev->data; struct net_device *del_dev; u8 mux_id; int ret = 0; if (kstrtou8(buf, 0, &mux_id)) return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); del_dev = qmimux_find_dev(dev, mux_id); if (!del_dev) { netdev_err(dev->net, "mux_id not present\n"); ret = -EINVAL; goto err; } qmimux_unregister_device(del_dev, NULL); if (!qmimux_has_slaves(dev)) info->flags &= ~QMI_WWAN_FLAG_MUX; ret = len; err: rtnl_unlock(); return ret; } static ssize_t pass_through_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info; info = (void *)&dev->data; return sprintf(buf, "%c\n", info->flags & QMI_WWAN_FLAG_PASS_THROUGH ? 'Y' : 'N'); } static ssize_t pass_through_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct qmi_wwan_state *info; bool enable; if (kstrtobool(buf, &enable)) return -EINVAL; info = (void *)&dev->data; /* no change? */ if (enable == (info->flags & QMI_WWAN_FLAG_PASS_THROUGH)) return len; /* pass through mode can be set for raw ip devices only */ if (!(info->flags & QMI_WWAN_FLAG_RAWIP)) { netdev_err(dev->net, "Cannot set pass through mode on non ip device\n"); return -EINVAL; } if (enable) info->flags |= QMI_WWAN_FLAG_PASS_THROUGH; else info->flags &= ~QMI_WWAN_FLAG_PASS_THROUGH; return len; } static DEVICE_ATTR_RW(raw_ip); static DEVICE_ATTR_RW(add_mux); static DEVICE_ATTR_RW(del_mux); static DEVICE_ATTR_RW(pass_through); static struct attribute *qmi_wwan_sysfs_attrs[] = { &dev_attr_raw_ip.attr, &dev_attr_add_mux.attr, &dev_attr_del_mux.attr, &dev_attr_pass_through.attr, NULL, }; static struct attribute_group qmi_wwan_sysfs_attr_group = { .name = "qmi", .attrs = qmi_wwan_sysfs_attrs, }; /* default ethernet address used by the modem */ static const u8 default_modem_addr[ETH_ALEN] = {0x02, 0x50, 0xf3}; static const u8 buggy_fw_addr[ETH_ALEN] = {0x00, 0xa0, 0xc6, 0x00, 0x00, 0x00}; /* Make up an ethernet header if the packet doesn't have one. * * A firmware bug common among several devices cause them to send raw * IP packets under some circumstances. There is no way for the * driver/host to know when this will happen. And even when the bug * hits, some packets will still arrive with an intact header. * * The supported devices are only capably of sending IPv4, IPv6 and * ARP packets on a point-to-point link. Any packet with an ethernet * header will have either our address or a broadcast/multicast * address as destination. ARP packets will always have a header. * * This means that this function will reliably add the appropriate * header iff necessary, provided our hardware address does not start * with 4 or 6. * * Another common firmware bug results in all packets being addressed * to 00:a0:c6:00:00:00 despite the host address being different. * This function will also fixup such packets. */ static int qmi_wwan_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { struct qmi_wwan_state *info = (void *)&dev->data; bool rawip = info->flags & QMI_WWAN_FLAG_RAWIP; __be16 proto; /* This check is no longer done by usbnet */ if (skb->len < dev->net->hard_header_len) return 0; if (info->flags & QMI_WWAN_FLAG_MUX) return qmimux_rx_fixup(dev, skb); if (info->flags & QMI_WWAN_FLAG_PASS_THROUGH) { skb->protocol = htons(ETH_P_MAP); return 1; } switch (skb->data[0] & 0xf0) { case 0x40: proto = htons(ETH_P_IP); break; case 0x60: proto = htons(ETH_P_IPV6); break; case 0x00: if (rawip) return 0; if (is_multicast_ether_addr(skb->data)) return 1; /* possibly bogus destination - rewrite just in case */ skb_reset_mac_header(skb); goto fix_dest; default: if (rawip) return 0; /* pass along other packets without modifications */ return 1; } if (rawip) { skb_reset_mac_header(skb); skb->dev = dev->net; /* normally set by eth_type_trans */ skb->protocol = proto; return 1; } if (skb_headroom(skb) < ETH_HLEN) return 0; skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); eth_hdr(skb)->h_proto = proto; eth_zero_addr(eth_hdr(skb)->h_source); fix_dest: memcpy(eth_hdr(skb)->h_dest, dev->net->dev_addr, ETH_ALEN); return 1; } /* very simplistic detection of IPv4 or IPv6 headers */ static bool possibly_iphdr(const char *data) { return (data[0] & 0xd0) == 0x40; } /* disallow addresses which may be confused with IP headers */ static int qmi_wwan_mac_addr(struct net_device *dev, void *p) { int ret; struct sockaddr *addr = p; ret = eth_prepare_mac_addr_change(dev, p); if (ret < 0) return ret; if (possibly_iphdr(addr->sa_data)) return -EADDRNOTAVAIL; eth_commit_mac_addr_change(dev, p); return 0; } static const struct net_device_ops qmi_wwan_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_set_mac_address = qmi_wwan_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /* using a counter to merge subdriver requests with our own into a * combined state */ static int qmi_wwan_manage_power(struct usbnet *dev, int on) { struct qmi_wwan_state *info = (void *)&dev->data; int rv; dev_dbg(&dev->intf->dev, "%s() pmcount=%d, on=%d\n", __func__, atomic_read(&info->pmcount), on); if ((on && atomic_add_return(1, &info->pmcount) == 1) || (!on && atomic_dec_and_test(&info->pmcount))) { /* need autopm_get/put here to ensure the usbcore sees * the new value */ rv = usb_autopm_get_interface(dev->intf); dev->intf->needs_remote_wakeup = on; if (!rv) usb_autopm_put_interface(dev->intf); } return 0; } static int qmi_wwan_cdc_wdm_manage_power(struct usb_interface *intf, int on) { struct usbnet *dev = usb_get_intfdata(intf); /* can be called while disconnecting */ if (!dev) return 0; return qmi_wwan_manage_power(dev, on); } /* collect all three endpoints and register subdriver */ static int qmi_wwan_register_subdriver(struct usbnet *dev) { int rv; struct usb_driver *subdriver = NULL; struct qmi_wwan_state *info = (void *)&dev->data; /* collect bulk endpoints */ rv = usbnet_get_endpoints(dev, info->data); if (rv < 0) goto err; /* update status endpoint if separate control interface */ if (info->control != info->data) dev->status = &info->control->cur_altsetting->endpoint[0]; /* require interrupt endpoint for subdriver */ if (!dev->status) { rv = -EINVAL; goto err; } /* for subdriver power management */ atomic_set(&info->pmcount, 0); /* register subdriver */ subdriver = usb_cdc_wdm_register(info->control, &dev->status->desc, 4096, WWAN_PORT_QMI, &qmi_wwan_cdc_wdm_manage_power); if (IS_ERR(subdriver)) { dev_err(&info->control->dev, "subdriver registration failed\n"); rv = PTR_ERR(subdriver); goto err; } /* prevent usbnet from using status endpoint */ dev->status = NULL; /* save subdriver struct for suspend/resume wrappers */ info->subdriver = subdriver; err: return rv; } /* Send CDC SetControlLineState request, setting or clearing the DTR. * "Required for Autoconnect and 9x30 to wake up" according to the * GobiNet driver. The requirement has been verified on an MDM9230 * based Sierra Wireless MC7455 */ static int qmi_wwan_change_dtr(struct usbnet *dev, bool on) { u8 intf = dev->intf->cur_altsetting->desc.bInterfaceNumber; return usbnet_write_cmd(dev, USB_CDC_REQ_SET_CONTROL_LINE_STATE, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, on ? 0x01 : 0x00, intf, NULL, 0); } static int qmi_wwan_bind(struct usbnet *dev, struct usb_interface *intf) { int status; u8 *buf = intf->cur_altsetting->extra; int len = intf->cur_altsetting->extralen; struct usb_interface_descriptor *desc = &intf->cur_altsetting->desc; struct usb_cdc_union_desc *cdc_union; struct usb_cdc_ether_desc *cdc_ether; struct usb_driver *driver = driver_of(intf); struct qmi_wwan_state *info = (void *)&dev->data; struct usb_cdc_parsed_header hdr; BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct qmi_wwan_state))); /* set up initial state */ info->control = intf; info->data = intf; /* and a number of CDC descriptors */ cdc_parse_cdc_header(&hdr, intf, buf, len); cdc_union = hdr.usb_cdc_union_desc; cdc_ether = hdr.usb_cdc_ether_desc; /* Use separate control and data interfaces if we found a CDC Union */ if (cdc_union) { info->data = usb_ifnum_to_if(dev->udev, cdc_union->bSlaveInterface0); if (desc->bInterfaceNumber != cdc_union->bMasterInterface0 || !info->data) { dev_err(&intf->dev, "bogus CDC Union: master=%u, slave=%u\n", cdc_union->bMasterInterface0, cdc_union->bSlaveInterface0); /* ignore and continue... */ cdc_union = NULL; info->data = intf; } } /* errors aren't fatal - we can live with the dynamic address */ if (cdc_ether && cdc_ether->wMaxSegmentSize) { dev->hard_mtu = le16_to_cpu(cdc_ether->wMaxSegmentSize); usbnet_get_ethernet_addr(dev, cdc_ether->iMACAddress); } /* claim data interface and set it up */ if (info->control != info->data) { status = usb_driver_claim_interface(driver, info->data, dev); if (status < 0) goto err; } status = qmi_wwan_register_subdriver(dev); if (status < 0 && info->control != info->data) { usb_set_intfdata(info->data, NULL); usb_driver_release_interface(driver, info->data); } /* disabling remote wakeup on MDM9x30 devices has the same * effect as clearing DTR. The device will not respond to QMI * requests until we set DTR again. This is similar to a * QMI_CTL SYNC request, clearing a lot of firmware state * including the client ID allocations. * * Our usage model allows a session to span multiple * open/close events, so we must prevent the firmware from * clearing out state the clients might need. * * MDM9x30 is the first QMI chipset with USB3 support. Abuse * this fact to enable the quirk for all USB3 devices. * * There are also chipsets with the same "set DTR" requirement * but without USB3 support. Devices based on these chips * need a quirk flag in the device ID table. */ if (dev->driver_info->data & QMI_WWAN_QUIRK_DTR || le16_to_cpu(dev->udev->descriptor.bcdUSB) >= 0x0201) { qmi_wwan_manage_power(dev, 1); qmi_wwan_change_dtr(dev, true); } /* Never use the same address on both ends of the link, even if the * buggy firmware told us to. Or, if device is assigned the well-known * buggy firmware MAC address, replace it with a random address, */ if (ether_addr_equal(dev->net->dev_addr, default_modem_addr) || ether_addr_equal(dev->net->dev_addr, buggy_fw_addr)) eth_hw_addr_random(dev->net); /* make MAC addr easily distinguishable from an IP header */ if (possibly_iphdr(dev->net->dev_addr)) { u8 addr = dev->net->dev_addr[0]; addr |= 0x02; /* set local assignment bit */ addr &= 0xbf; /* clear "IP" bit */ dev_addr_mod(dev->net, 0, &addr, 1); } dev->net->netdev_ops = &qmi_wwan_netdev_ops; dev->net->sysfs_groups[0] = &qmi_wwan_sysfs_attr_group; err: return status; } static void qmi_wwan_unbind(struct usbnet *dev, struct usb_interface *intf) { struct qmi_wwan_state *info = (void *)&dev->data; struct usb_driver *driver = driver_of(intf); struct usb_interface *other; if (info->subdriver && info->subdriver->disconnect) info->subdriver->disconnect(info->control); /* disable MDM9x30 quirk */ if (le16_to_cpu(dev->udev->descriptor.bcdUSB) >= 0x0201) { qmi_wwan_change_dtr(dev, false); qmi_wwan_manage_power(dev, 0); } /* allow user to unbind using either control or data */ if (intf == info->control) other = info->data; else other = info->control; /* only if not shared */ if (other && intf != other) { usb_set_intfdata(other, NULL); usb_driver_release_interface(driver, other); } info->subdriver = NULL; info->data = NULL; info->control = NULL; } /* suspend/resume wrappers calling both usbnet and the cdc-wdm * subdriver if present. * * NOTE: cdc-wdm also supports pre/post_reset, but we cannot provide * wrappers for those without adding usbnet reset support first. */ static int qmi_wwan_suspend(struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); struct qmi_wwan_state *info = (void *)&dev->data; int ret; /* Both usbnet_suspend() and subdriver->suspend() MUST return 0 * in system sleep context, otherwise, the resume callback has * to recover device from previous suspend failure. */ ret = usbnet_suspend(intf, message); if (ret < 0) goto err; if (intf == info->control && info->subdriver && info->subdriver->suspend) ret = info->subdriver->suspend(intf, message); if (ret < 0) usbnet_resume(intf); err: return ret; } static int qmi_wwan_resume(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct qmi_wwan_state *info = (void *)&dev->data; int ret = 0; bool callsub = (intf == info->control && info->subdriver && info->subdriver->resume); if (callsub) ret = info->subdriver->resume(intf); if (ret < 0) goto err; ret = usbnet_resume(intf); if (ret < 0 && callsub) info->subdriver->suspend(intf, PMSG_SUSPEND); err: return ret; } static const struct driver_info qmi_wwan_info = { .description = "WWAN/QMI device", .flags = FLAG_WWAN | FLAG_SEND_ZLP, .bind = qmi_wwan_bind, .unbind = qmi_wwan_unbind, .manage_power = qmi_wwan_manage_power, .rx_fixup = qmi_wwan_rx_fixup, }; static const struct driver_info qmi_wwan_info_quirk_dtr = { .description = "WWAN/QMI device", .flags = FLAG_WWAN | FLAG_SEND_ZLP, .bind = qmi_wwan_bind, .unbind = qmi_wwan_unbind, .manage_power = qmi_wwan_manage_power, .rx_fixup = qmi_wwan_rx_fixup, .data = QMI_WWAN_QUIRK_DTR, }; #define HUAWEI_VENDOR_ID 0x12D1 /* map QMI/wwan function by a fixed interface number */ #define QMI_FIXED_INTF(vend, prod, num) \ USB_DEVICE_INTERFACE_NUMBER(vend, prod, num), \ .driver_info = (unsigned long)&qmi_wwan_info /* devices requiring "set DTR" quirk */ #define QMI_QUIRK_SET_DTR(vend, prod, num) \ USB_DEVICE_INTERFACE_NUMBER(vend, prod, num), \ .driver_info = (unsigned long)&qmi_wwan_info_quirk_dtr /* Gobi 1000 QMI/wwan interface number is 3 according to qcserial */ #define QMI_GOBI1K_DEVICE(vend, prod) \ QMI_FIXED_INTF(vend, prod, 3) /* Gobi 2000/3000 QMI/wwan interface number is 0 according to qcserial */ #define QMI_GOBI_DEVICE(vend, prod) \ QMI_FIXED_INTF(vend, prod, 0) /* Many devices have QMI and DIAG functions which are distinguishable * from other vendor specific functions by class, subclass and * protocol all being 0xff. The DIAG function has exactly 2 endpoints * and is silently rejected when probed. * * This makes it possible to match dynamically numbered QMI functions * as seen on e.g. many Quectel modems. */ #define QMI_MATCH_FF_FF_FF(vend, prod) \ USB_DEVICE_AND_INTERFACE_INFO(vend, prod, USB_CLASS_VENDOR_SPEC, \ USB_SUBCLASS_VENDOR_SPEC, 0xff), \ .driver_info = (unsigned long)&qmi_wwan_info_quirk_dtr static const struct usb_device_id products[] = { /* 1. CDC ECM like devices match on the control interface */ { /* Huawei E392, E398 and possibly others sharing both device id and more... */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 9), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Vodafone/Huawei K5005 (12d1:14c8) and similar modems */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 57), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HUAWEI_INTERFACE_NDIS_CONTROL_QUALCOMM */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x69), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Motorola Mapphone devices with MDM6600 */ USB_VENDOR_AND_INTERFACE_INFO(0x22b8, USB_CLASS_VENDOR_SPEC, 0xfb, 0xff), .driver_info = (unsigned long)&qmi_wwan_info, }, /* 2. Combined interface devices matching on class+protocol */ { /* Huawei E367 and possibly others in "Windows mode" */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 7), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Huawei E392, E398 and possibly others in "Windows mode" */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 1, 17), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HUAWEI_NDIS_SINGLE_INTERFACE_VDF */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x37), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HUAWEI_INTERFACE_NDIS_HW_QUALCOMM */ USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x67), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Pantech UML290, P4200 and more */ USB_VENDOR_AND_INTERFACE_INFO(0x106c, USB_CLASS_VENDOR_SPEC, 0xf0, 0xff), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Pantech UML290 - newer firmware */ USB_VENDOR_AND_INTERFACE_INFO(0x106c, USB_CLASS_VENDOR_SPEC, 0xf1, 0xff), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Novatel USB551L and MC551 */ USB_DEVICE_AND_INTERFACE_INFO(0x1410, 0xb001, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Novatel E362 */ USB_DEVICE_AND_INTERFACE_INFO(0x1410, 0x9010, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Novatel Expedite E371 */ USB_DEVICE_AND_INTERFACE_INFO(0x1410, 0x9011, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Dell Wireless 5800 (Novatel E362) */ USB_DEVICE_AND_INTERFACE_INFO(0x413C, 0x8195, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Dell Wireless 5800 V2 (Novatel E362) */ USB_DEVICE_AND_INTERFACE_INFO(0x413C, 0x8196, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* Dell Wireless 5804 (Novatel E371) */ USB_DEVICE_AND_INTERFACE_INFO(0x413C, 0x819b, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* ADU960S */ USB_DEVICE_AND_INTERFACE_INFO(0x16d5, 0x650a, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HP lt2523 (Novatel E371) */ USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0x421d, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&qmi_wwan_info, }, { /* HP lt4112 LTE/HSPA+ Gobi 4G Module (Huawei me906e) */ USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0x581d, USB_CLASS_VENDOR_SPEC, 1, 7), .driver_info = (unsigned long)&qmi_wwan_info, }, {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0122)}, /* Quectel RG650V */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0125)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0306)}, /* Quectel EP06/EG06/EM06 */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0512)}, /* Quectel EG12/EM12 */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0620)}, /* Quectel EM160R-GL */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0801)}, /* Quectel RM520N */ /* 3. Combined interface devices matching on interface number */ {QMI_FIXED_INTF(0x0408, 0xea42, 4)}, /* Yota / Megafon M100-1 */ {QMI_FIXED_INTF(0x05c6, 0x6001, 3)}, /* 4G LTE usb-modem U901 */ {QMI_FIXED_INTF(0x05c6, 0x7000, 0)}, {QMI_FIXED_INTF(0x05c6, 0x7001, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7002, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7101, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7101, 2)}, {QMI_FIXED_INTF(0x05c6, 0x7101, 3)}, {QMI_FIXED_INTF(0x05c6, 0x7102, 1)}, {QMI_FIXED_INTF(0x05c6, 0x7102, 2)}, {QMI_FIXED_INTF(0x05c6, 0x7102, 3)}, {QMI_FIXED_INTF(0x05c6, 0x8000, 7)}, {QMI_FIXED_INTF(0x05c6, 0x8001, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9000, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9003, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9005, 2)}, {QMI_FIXED_INTF(0x05c6, 0x900a, 4)}, {QMI_FIXED_INTF(0x05c6, 0x900b, 2)}, {QMI_FIXED_INTF(0x05c6, 0x900c, 4)}, {QMI_FIXED_INTF(0x05c6, 0x900c, 5)}, {QMI_FIXED_INTF(0x05c6, 0x900c, 6)}, {QMI_FIXED_INTF(0x05c6, 0x900d, 5)}, {QMI_FIXED_INTF(0x05c6, 0x900f, 3)}, {QMI_FIXED_INTF(0x05c6, 0x900f, 4)}, {QMI_FIXED_INTF(0x05c6, 0x900f, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9010, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9010, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9011, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9011, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9021, 1)}, {QMI_FIXED_INTF(0x05c6, 0x9022, 2)}, {QMI_QUIRK_SET_DTR(0x05c6, 0x9025, 4)}, /* Alcatel-sbell ASB TL131 TDD LTE (China Mobile) */ {QMI_FIXED_INTF(0x05c6, 0x9026, 3)}, {QMI_FIXED_INTF(0x05c6, 0x902e, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9031, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9032, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9033, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9034, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9035, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9036, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9037, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9038, 4)}, {QMI_FIXED_INTF(0x05c6, 0x903b, 7)}, {QMI_FIXED_INTF(0x05c6, 0x903c, 6)}, {QMI_FIXED_INTF(0x05c6, 0x903d, 6)}, {QMI_FIXED_INTF(0x05c6, 0x903e, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9043, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9046, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9046, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9046, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9047, 2)}, {QMI_FIXED_INTF(0x05c6, 0x9047, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9047, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9048, 8)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 5)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 6)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 7)}, {QMI_FIXED_INTF(0x05c6, 0x904c, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9050, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9052, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9053, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9053, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9054, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9054, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9055, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9056, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 2)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9062, 9)}, {QMI_FIXED_INTF(0x05c6, 0x9064, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9065, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9065, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9066, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9066, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9067, 1)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 2)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9068, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9069, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9069, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9069, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9069, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9070, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9070, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9075, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9076, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9077, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9078, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 4)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9079, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 5)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 6)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 7)}, {QMI_FIXED_INTF(0x05c6, 0x9080, 8)}, {QMI_FIXED_INTF(0x05c6, 0x9083, 3)}, {QMI_FIXED_INTF(0x05c6, 0x9084, 4)}, {QMI_QUIRK_SET_DTR(0x05c6, 0x9091, 2)}, /* Compal RXM-G1 */ {QMI_FIXED_INTF(0x05c6, 0x90b2, 3)}, /* ublox R410M */ {QMI_QUIRK_SET_DTR(0x05c6, 0x90db, 2)}, /* Compal RXM-G1 */ {QMI_FIXED_INTF(0x05c6, 0x920d, 0)}, {QMI_FIXED_INTF(0x05c6, 0x920d, 5)}, {QMI_QUIRK_SET_DTR(0x05c6, 0x9625, 4)}, /* YUGA CLM920-NC5 */ {QMI_FIXED_INTF(0x0846, 0x68a2, 8)}, {QMI_FIXED_INTF(0x0846, 0x68d3, 8)}, /* Netgear Aircard 779S */ {QMI_FIXED_INTF(0x12d1, 0x140c, 1)}, /* Huawei E173 */ {QMI_FIXED_INTF(0x12d1, 0x14ac, 1)}, /* Huawei E1820 */ {QMI_FIXED_INTF(0x1435, 0x0918, 3)}, /* Wistron NeWeb D16Q1 */ {QMI_FIXED_INTF(0x1435, 0x0918, 4)}, /* Wistron NeWeb D16Q1 */ {QMI_FIXED_INTF(0x1435, 0x0918, 5)}, /* Wistron NeWeb D16Q1 */ {QMI_FIXED_INTF(0x1435, 0x3185, 4)}, /* Wistron NeWeb M18Q5 */ {QMI_FIXED_INTF(0x1435, 0xd111, 4)}, /* M9615A DM11-1 D51QC */ {QMI_FIXED_INTF(0x1435, 0xd181, 3)}, /* Wistron NeWeb D18Q1 */ {QMI_FIXED_INTF(0x1435, 0xd181, 4)}, /* Wistron NeWeb D18Q1 */ {QMI_FIXED_INTF(0x1435, 0xd181, 5)}, /* Wistron NeWeb D18Q1 */ {QMI_FIXED_INTF(0x1435, 0xd182, 4)}, /* Wistron NeWeb D18 */ {QMI_FIXED_INTF(0x1435, 0xd182, 5)}, /* Wistron NeWeb D18 */ {QMI_FIXED_INTF(0x1435, 0xd191, 4)}, /* Wistron NeWeb D19Q1 */ {QMI_QUIRK_SET_DTR(0x1508, 0x1001, 4)}, /* Fibocom NL668 series */ {QMI_FIXED_INTF(0x1690, 0x7588, 4)}, /* ASKEY WWHC050 */ {QMI_FIXED_INTF(0x16d8, 0x6003, 0)}, /* CMOTech 6003 */ {QMI_FIXED_INTF(0x16d8, 0x6007, 0)}, /* CMOTech CHE-628S */ {QMI_FIXED_INTF(0x16d8, 0x6008, 0)}, /* CMOTech CMU-301 */ {QMI_FIXED_INTF(0x16d8, 0x6280, 0)}, /* CMOTech CHU-628 */ {QMI_FIXED_INTF(0x16d8, 0x7001, 0)}, /* CMOTech CHU-720S */ {QMI_FIXED_INTF(0x16d8, 0x7002, 0)}, /* CMOTech 7002 */ {QMI_FIXED_INTF(0x16d8, 0x7003, 4)}, /* CMOTech CHU-629K */ {QMI_FIXED_INTF(0x16d8, 0x7004, 3)}, /* CMOTech 7004 */ {QMI_FIXED_INTF(0x16d8, 0x7006, 5)}, /* CMOTech CGU-629 */ {QMI_FIXED_INTF(0x16d8, 0x700a, 4)}, /* CMOTech CHU-629S */ {QMI_FIXED_INTF(0x16d8, 0x7211, 0)}, /* CMOTech CHU-720I */ {QMI_FIXED_INTF(0x16d8, 0x7212, 0)}, /* CMOTech 7212 */ {QMI_FIXED_INTF(0x16d8, 0x7213, 0)}, /* CMOTech 7213 */ {QMI_FIXED_INTF(0x16d8, 0x7251, 1)}, /* CMOTech 7251 */ {QMI_FIXED_INTF(0x16d8, 0x7252, 1)}, /* CMOTech 7252 */ {QMI_FIXED_INTF(0x16d8, 0x7253, 1)}, /* CMOTech 7253 */ {QMI_FIXED_INTF(0x19d2, 0x0002, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0012, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0017, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0019, 3)}, /* ONDA MT689DC */ {QMI_FIXED_INTF(0x19d2, 0x0021, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0025, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0031, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0042, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0049, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0052, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0055, 1)}, /* ZTE (Vodafone) K3520-Z */ {QMI_FIXED_INTF(0x19d2, 0x0058, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0063, 4)}, /* ZTE (Vodafone) K3565-Z */ {QMI_FIXED_INTF(0x19d2, 0x0104, 4)}, /* ZTE (Vodafone) K4505-Z */ {QMI_FIXED_INTF(0x19d2, 0x0113, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0118, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0121, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0123, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0124, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0125, 6)}, {QMI_FIXED_INTF(0x19d2, 0x0126, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0130, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0133, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0141, 5)}, {QMI_FIXED_INTF(0x19d2, 0x0157, 5)}, /* ZTE MF683 */ {QMI_FIXED_INTF(0x19d2, 0x0158, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0167, 4)}, /* ZTE MF820D */ {QMI_FIXED_INTF(0x19d2, 0x0168, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0176, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0178, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0189, 4)}, /* ZTE MF290 */ {QMI_FIXED_INTF(0x19d2, 0x0191, 4)}, /* ZTE EuFi890 */ {QMI_FIXED_INTF(0x19d2, 0x0199, 1)}, /* ZTE MF820S */ {QMI_FIXED_INTF(0x19d2, 0x0200, 1)}, {QMI_FIXED_INTF(0x19d2, 0x0257, 3)}, /* ZTE MF821 */ {QMI_FIXED_INTF(0x19d2, 0x0265, 4)}, /* ONDA MT8205 4G LTE */ {QMI_FIXED_INTF(0x19d2, 0x0284, 4)}, /* ZTE MF880 */ {QMI_FIXED_INTF(0x19d2, 0x0326, 4)}, /* ZTE MF821D */ {QMI_FIXED_INTF(0x19d2, 0x0396, 3)}, /* ZTE ZM8620 */ {QMI_FIXED_INTF(0x19d2, 0x0412, 4)}, /* Telewell TW-LTE 4G */ {QMI_FIXED_INTF(0x19d2, 0x1008, 4)}, /* ZTE (Vodafone) K3570-Z */ {QMI_FIXED_INTF(0x19d2, 0x1010, 4)}, /* ZTE (Vodafone) K3571-Z */ {QMI_FIXED_INTF(0x19d2, 0x1012, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1018, 3)}, /* ZTE (Vodafone) K5006-Z */ {QMI_FIXED_INTF(0x19d2, 0x1021, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1245, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1247, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1252, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1254, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1255, 3)}, {QMI_FIXED_INTF(0x19d2, 0x1255, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1256, 4)}, {QMI_FIXED_INTF(0x19d2, 0x1270, 5)}, /* ZTE MF667 */ {QMI_FIXED_INTF(0x19d2, 0x1275, 3)}, /* ZTE P685M */ {QMI_FIXED_INTF(0x19d2, 0x1401, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1402, 2)}, /* ZTE MF60 */ {QMI_FIXED_INTF(0x19d2, 0x1424, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1425, 2)}, {QMI_FIXED_INTF(0x19d2, 0x1426, 2)}, /* ZTE MF91 */ {QMI_FIXED_INTF(0x19d2, 0x1428, 2)}, /* Telewell TW-LTE 4G v2 */ {QMI_FIXED_INTF(0x19d2, 0x1432, 3)}, /* ZTE ME3620 */ {QMI_FIXED_INTF(0x19d2, 0x1485, 5)}, /* ZTE MF286D */ {QMI_FIXED_INTF(0x19d2, 0x2002, 4)}, /* ZTE (Vodafone) K3765-Z */ {QMI_FIXED_INTF(0x2001, 0x7e16, 3)}, /* D-Link DWM-221 */ {QMI_FIXED_INTF(0x2001, 0x7e19, 4)}, /* D-Link DWM-221 B1 */ {QMI_FIXED_INTF(0x2001, 0x7e35, 4)}, /* D-Link DWM-222 */ {QMI_FIXED_INTF(0x2001, 0x7e3d, 4)}, /* D-Link DWM-222 A2 */ {QMI_FIXED_INTF(0x2020, 0x2031, 4)}, /* Olicard 600 */ {QMI_FIXED_INTF(0x2020, 0x2033, 4)}, /* BroadMobi BM806U */ {QMI_QUIRK_SET_DTR(0x2020, 0x2060, 4)}, /* BroadMobi BM818 */ {QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)}, /* Sierra Wireless MC7700 */ {QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */ {QMI_FIXED_INTF(0x1199, 0x68a2, 8)}, /* Sierra Wireless MC7710 in QMI mode */ {QMI_FIXED_INTF(0x1199, 0x68a2, 19)}, /* Sierra Wireless MC7710 in QMI mode */ {QMI_QUIRK_SET_DTR(0x1199, 0x68c0, 8)}, /* Sierra Wireless MC7304/MC7354, WP76xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x68c0, 10)},/* Sierra Wireless MC7304/MC7354 */ {QMI_FIXED_INTF(0x1199, 0x901c, 8)}, /* Sierra Wireless EM7700 */ {QMI_FIXED_INTF(0x1199, 0x901f, 8)}, /* Sierra Wireless EM7355 */ {QMI_FIXED_INTF(0x1199, 0x9041, 8)}, /* Sierra Wireless MC7305/MC7355 */ {QMI_FIXED_INTF(0x1199, 0x9041, 10)}, /* Sierra Wireless MC7305/MC7355 */ {QMI_FIXED_INTF(0x1199, 0x9051, 8)}, /* Netgear AirCard 340U */ {QMI_FIXED_INTF(0x1199, 0x9053, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9054, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9055, 8)}, /* Netgear AirCard 341U */ {QMI_FIXED_INTF(0x1199, 0x9056, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9057, 8)}, {QMI_FIXED_INTF(0x1199, 0x9061, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9063, 8)}, /* Sierra Wireless EM7305 */ {QMI_FIXED_INTF(0x1199, 0x9063, 10)}, /* Sierra Wireless EM7305 */ {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 10)},/* Sierra Wireless MC74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 10)},/* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 10)},/* Sierra Wireless EM74xx */ {QMI_QUIRK_SET_DTR(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */ {QMI_QUIRK_SET_DTR(0x1199, 0xc081, 8)}, /* Sierra Wireless EM7590 */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1031, 3)}, /* Telit LE910C1-EUX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x103a, 0)}, /* Telit LE910C4-WWX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1050, 2)}, /* Telit FN980 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1057, 2)}, /* Telit FN980 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1060, 2)}, /* Telit LN920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1070, 2)}, /* Telit FN990 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1080, 2)}, /* Telit FE990 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10a0, 0)}, /* Telit FN920C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10a4, 0)}, /* Telit FN920C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10a9, 0)}, /* Telit FN920C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10c0, 0)}, /* Telit FE910C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10c4, 0)}, /* Telit FE910C04 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x10c8, 0)}, /* Telit FE910C04 */ {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ {QMI_FIXED_INTF(0x1bc7, 0x1101, 3)}, /* Telit ME910 dual modem */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1230, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1250, 0)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1260, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1261, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1900, 1)}, /* Telit LN940 series */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x3000, 0)}, /* Telit FN912 series */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x3001, 0)}, /* Telit FN912 series */ {QMI_FIXED_INTF(0x1c9e, 0x9801, 3)}, /* Telewell TW-3G HSPA+ */ {QMI_FIXED_INTF(0x1c9e, 0x9803, 4)}, /* Telewell TW-3G HSPA+ */ {QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)}, /* XS Stick W100-2 from 4G Systems */ {QMI_QUIRK_SET_DTR(0x1c9e, 0x9b05, 4)}, /* Longsung U8300 */ {QMI_QUIRK_SET_DTR(0x1c9e, 0x9b3c, 4)}, /* Longsung U9300 */ {QMI_FIXED_INTF(0x0b3c, 0xc000, 4)}, /* Olivetti Olicard 100 */ {QMI_FIXED_INTF(0x0b3c, 0xc001, 4)}, /* Olivetti Olicard 120 */ {QMI_FIXED_INTF(0x0b3c, 0xc002, 4)}, /* Olivetti Olicard 140 */ {QMI_FIXED_INTF(0x0b3c, 0xc004, 6)}, /* Olivetti Olicard 155 */ {QMI_FIXED_INTF(0x0b3c, 0xc005, 6)}, /* Olivetti Olicard 200 */ {QMI_FIXED_INTF(0x0b3c, 0xc00a, 6)}, /* Olivetti Olicard 160 */ {QMI_FIXED_INTF(0x0b3c, 0xc00b, 4)}, /* Olivetti Olicard 500 */ {QMI_FIXED_INTF(0x1e2d, 0x0060, 4)}, /* Cinterion PLxx */ {QMI_QUIRK_SET_DTR(0x1e2d, 0x006f, 8)}, /* Cinterion PLS83/PLS63 */ {QMI_FIXED_INTF(0x1e2d, 0x0053, 4)}, /* Cinterion PHxx,PXxx */ {QMI_FIXED_INTF(0x1e2d, 0x0063, 10)}, /* Cinterion ALASxx (1 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0082, 4)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0082, 5)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0083, 4)}, /* Cinterion PHxx,PXxx (1 RmNet + USB Audio)*/ {QMI_QUIRK_SET_DTR(0x1e2d, 0x00b0, 4)}, /* Cinterion CLS8 */ {QMI_FIXED_INTF(0x1e2d, 0x00b7, 0)}, /* Cinterion MV31 RmNet */ {QMI_FIXED_INTF(0x1e2d, 0x00b9, 0)}, /* Cinterion MV31 RmNet based on new baseline */ {QMI_FIXED_INTF(0x1e2d, 0x00f3, 0)}, /* Cinterion MV32-W-A RmNet */ {QMI_FIXED_INTF(0x1e2d, 0x00f4, 0)}, /* Cinterion MV32-W-B RmNet */ {QMI_FIXED_INTF(0x413c, 0x81a2, 8)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a3, 8)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a4, 8)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a8, 8)}, /* Dell Wireless 5808 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a9, 8)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b1, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b3, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81c2, 8)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81cc, 8)}, /* Dell Wireless 5816e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 0)}, /* Dell Wireless 5821e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 1)}, /* Dell Wireless 5821e preproduction config */ {QMI_FIXED_INTF(0x413c, 0x81e0, 0)}, /* Dell Wireless 5821e with eSIM support*/ {QMI_FIXED_INTF(0x413c, 0x81e4, 0)}, /* Dell Wireless 5829e with eSIM support*/ {QMI_FIXED_INTF(0x413c, 0x81e6, 0)}, /* Dell Wireless 5829e */ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)}, /* HP lt4120 Snapdragon X5 LTE */ {QMI_QUIRK_SET_DTR(0x22de, 0x9051, 2)}, /* Hucom Wireless HM-211S/K */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_QUIRK_SET_DTR(0x1e0e, 0x9001, 5)}, /* SIMCom 7100E, 7230E, 7600E ++ */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0191, 4)}, /* Quectel EG91 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x030e, 4)}, /* Quectel EM05GV2 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0316, 3)}, /* Quectel RG255C */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0112, 0)}, /* Fibocom FG132 */ {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ {QMI_FIXED_INTF(0x0489, 0xe0b5, 0)}, /* Foxconn T77W968 LTE with eSIM support*/ {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ {QMI_QUIRK_SET_DTR(0x1546, 0x1312, 4)}, /* u-blox LARA-R6 01B */ {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ {QMI_QUIRK_SET_DTR(0x33f8, 0x0104, 4)}, /* Rolling RW101 RMNET */ {QMI_FIXED_INTF(0x2dee, 0x4d22, 5)}, /* MeiG Smart SRM825L */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ {QMI_GOBI1K_DEVICE(0x03f0, 0x1f1d)}, /* HP un2400 Gobi Modem Device */ {QMI_GOBI1K_DEVICE(0x04da, 0x250d)}, /* Panasonic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x413c, 0x8172)}, /* Dell Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa001)}, /* Novatel/Verizon USB-1000 */ {QMI_GOBI1K_DEVICE(0x1410, 0xa002)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa003)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa004)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa005)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa006)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x1410, 0xa007)}, /* Novatel Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x0b05, 0x1776)}, /* Asus Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x19d2, 0xfff3)}, /* ONDA Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9001)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9002)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9202)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9203)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9222)}, /* Generic Gobi Modem device */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9009)}, /* Generic Gobi Modem device */ /* 5. Gobi 2000 and 3000 devices */ {QMI_GOBI_DEVICE(0x413c, 0x8186)}, /* Dell Gobi 2000 Modem device (N0218, VU936) */ {QMI_GOBI_DEVICE(0x413c, 0x8194)}, /* Dell Gobi 3000 Composite */ {QMI_GOBI_DEVICE(0x05c6, 0x920b)}, /* Generic Gobi 2000 Modem device */ {QMI_GOBI_DEVICE(0x05c6, 0x9225)}, /* Sony Gobi 2000 Modem device (N0279, VU730) */ {QMI_GOBI_DEVICE(0x05c6, 0x9245)}, /* Samsung Gobi 2000 Modem device (VL176) */ {QMI_GOBI_DEVICE(0x03f0, 0x251d)}, /* HP Gobi 2000 Modem device (VP412) */ {QMI_GOBI_DEVICE(0x05c6, 0x9215)}, /* Acer Gobi 2000 Modem device (VP413) */ {QMI_FIXED_INTF(0x05c6, 0x9215, 4)}, /* Quectel EC20 Mini PCIe */ {QMI_GOBI_DEVICE(0x05c6, 0x9265)}, /* Asus Gobi 2000 Modem device (VR305) */ {QMI_GOBI_DEVICE(0x05c6, 0x9235)}, /* Top Global Gobi 2000 Modem device (VR306) */ {QMI_GOBI_DEVICE(0x05c6, 0x9275)}, /* iRex Technologies Gobi 2000 Modem device (VR307) */ {QMI_GOBI_DEVICE(0x0af0, 0x8120)}, /* Option GTM681W */ {QMI_GOBI_DEVICE(0x1199, 0x68a5)}, /* Sierra Wireless Modem */ {QMI_GOBI_DEVICE(0x1199, 0x68a9)}, /* Sierra Wireless Modem */ {QMI_GOBI_DEVICE(0x1199, 0x9001)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9002)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9003)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9004)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9005)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9006)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9007)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9008)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9009)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x900a)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */ {QMI_GOBI_DEVICE(0x1199, 0x9011)}, /* Sierra Wireless Gobi 2000 Modem device (MC8305) */ {QMI_GOBI_DEVICE(0x16d8, 0x8002)}, /* CMDTech Gobi 2000 Modem device (VU922) */ {QMI_GOBI_DEVICE(0x05c6, 0x9205)}, /* Gobi 2000 Modem device */ {QMI_GOBI_DEVICE(0x1199, 0x9013)}, /* Sierra Wireless Gobi 3000 Modem device (MC8355) */ {QMI_GOBI_DEVICE(0x03f0, 0x371d)}, /* HP un2430 Mobile Broadband Module */ {QMI_GOBI_DEVICE(0x1199, 0x9015)}, /* Sierra Wireless Gobi 3000 Modem device */ {QMI_GOBI_DEVICE(0x1199, 0x9019)}, /* Sierra Wireless Gobi 3000 Modem device */ {QMI_GOBI_DEVICE(0x1199, 0x901b)}, /* Sierra Wireless MC7770 */ {QMI_GOBI_DEVICE(0x12d1, 0x14f1)}, /* Sony Gobi 3000 Composite */ {QMI_GOBI_DEVICE(0x1410, 0xa021)}, /* Foxconn Gobi 3000 Modem device (Novatel E396) */ { } /* END */ }; MODULE_DEVICE_TABLE(usb, products); static bool quectel_ec20_detected(struct usb_interface *intf) { struct usb_device *dev = interface_to_usbdev(intf); if (dev->actconfig && le16_to_cpu(dev->descriptor.idVendor) == 0x05c6 && le16_to_cpu(dev->descriptor.idProduct) == 0x9215 && dev->actconfig->desc.bNumInterfaces == 5) return true; return false; } static int qmi_wwan_probe(struct usb_interface *intf, const struct usb_device_id *prod) { struct usb_device_id *id = (struct usb_device_id *)prod; struct usb_interface_descriptor *desc = &intf->cur_altsetting->desc; /* Workaround to enable dynamic IDs. This disables usbnet * blacklisting functionality. Which, if required, can be * reimplemented here by using a magic "blacklist" value * instead of 0 in the static device id table */ if (!id->driver_info) { dev_dbg(&intf->dev, "setting defaults for dynamic device id\n"); id->driver_info = (unsigned long)&qmi_wwan_info; } /* There are devices where the same interface number can be * configured as different functions. We should only bind to * vendor specific functions when matching on interface number */ if (id->match_flags & USB_DEVICE_ID_MATCH_INT_NUMBER && desc->bInterfaceClass != USB_CLASS_VENDOR_SPEC) { dev_dbg(&intf->dev, "Rejecting interface number match for class %02x\n", desc->bInterfaceClass); return -ENODEV; } /* Quectel EC20 quirk where we've QMI on interface 4 instead of 0 */ if (quectel_ec20_detected(intf) && desc->bInterfaceNumber == 0) { dev_dbg(&intf->dev, "Quectel EC20 quirk, skipping interface 0\n"); return -ENODEV; } /* Several Quectel modems supports dynamic interface configuration, so * we need to match on class/subclass/protocol. These values are * identical for the diagnostic- and QMI-interface, but bNumEndpoints is * different. Ignore the current interface if the number of endpoints * equals the number for the diag interface (two). */ if (desc->bNumEndpoints == 2) return -ENODEV; return usbnet_probe(intf, id); } static void qmi_wwan_disconnect(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct qmi_wwan_state *info; struct list_head *iter; struct net_device *ldev; LIST_HEAD(list); /* called twice if separate control and data intf */ if (!dev) return; info = (void *)&dev->data; if (info->flags & QMI_WWAN_FLAG_MUX) { if (!rtnl_trylock()) { restart_syscall(); return; } rcu_read_lock(); netdev_for_each_upper_dev_rcu(dev->net, ldev, iter) qmimux_unregister_device(ldev, &list); rcu_read_unlock(); unregister_netdevice_many(&list); rtnl_unlock(); info->flags &= ~QMI_WWAN_FLAG_MUX; } usbnet_disconnect(intf); } static struct usb_driver qmi_wwan_driver = { .name = "qmi_wwan", .id_table = products, .probe = qmi_wwan_probe, .disconnect = qmi_wwan_disconnect, .suspend = qmi_wwan_suspend, .resume = qmi_wwan_resume, .reset_resume = qmi_wwan_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(qmi_wwan_driver); MODULE_AUTHOR("Bjørn Mork <bjorn@mork.no>"); MODULE_DESCRIPTION("Qualcomm MSM Interface (QMI) WWAN driver"); MODULE_LICENSE("GPL"); |
| 760 3569 74 614 2952 17 26 313 415 98 97 32 813 11 2 2 74 82 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2020 Christoph Hellwig. * * Support for "universal" pointers that can point to either kernel or userspace * memory. */ #ifndef _LINUX_SOCKPTR_H #define _LINUX_SOCKPTR_H #include <linux/slab.h> #include <linux/uaccess.h> typedef struct { union { void *kernel; void __user *user; }; bool is_kernel : 1; } sockptr_t; static inline bool sockptr_is_kernel(sockptr_t sockptr) { return sockptr.is_kernel; } static inline sockptr_t KERNEL_SOCKPTR(void *p) { return (sockptr_t) { .kernel = p, .is_kernel = true }; } static inline sockptr_t USER_SOCKPTR(void __user *p) { return (sockptr_t) { .user = p }; } static inline bool sockptr_is_null(sockptr_t sockptr) { if (sockptr_is_kernel(sockptr)) return !sockptr.kernel; return !sockptr.user; } static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, size_t offset, size_t size) { if (!sockptr_is_kernel(src)) return copy_from_user(dst, src.user + offset, size); memcpy(dst, src.kernel + offset, size); return 0; } /* Deprecated. * This is unsafe, unless caller checked user provided optlen. * Prefer copy_safe_from_sockptr() instead. * * Returns 0 for success, or number of bytes not copied on error. */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } /** * copy_safe_from_sockptr: copy a struct from sockptr * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @optval: Source address. (in user or kernel space) * @optlen: Size of @optval data. * * Returns: * * -EINVAL: @optlen < @ksize * * -EFAULT: access to userspace failed. * * 0 : @ksize bytes were copied */ static inline int copy_safe_from_sockptr(void *dst, size_t ksize, sockptr_t optval, unsigned int optlen) { if (optlen < ksize) return -EINVAL; if (copy_from_sockptr(dst, optval, ksize)) return -EFAULT; return 0; } static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; if (!sockptr_is_kernel(src)) return copy_struct_from_user(dst, ksize, src.user, size); if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { char *p = src.kernel; while (rest--) { if (*p++) return -E2BIG; } } memcpy(dst, src.kernel, size); return 0; } static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { if (!sockptr_is_kernel(dst)) return copy_to_user(dst.user + offset, src, size); memcpy(dst.kernel + offset, src, size); return 0; } static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) { return copy_to_sockptr_offset(dst, 0, src, size); } static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len) { void *p = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } #define memdup_sockptr(...) alloc_hooks(memdup_sockptr_noprof(__VA_ARGS__)) static inline void *memdup_sockptr_nul_noprof(sockptr_t src, size_t len) { char *p = kmalloc_track_caller_noprof(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_sockptr(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } #define memdup_sockptr_nul(...) alloc_hooks(memdup_sockptr_nul_noprof(__VA_ARGS__)) static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count) { if (sockptr_is_kernel(src)) { size_t len = min(strnlen(src.kernel, count - 1) + 1, count); memcpy(dst, src.kernel, len); return len; } return strncpy_from_user(dst, src.user, count); } static inline int check_zeroed_sockptr(sockptr_t src, size_t offset, size_t size) { if (!sockptr_is_kernel(src)) return check_zeroed_user(src.user + offset, size); return memchr_inv(src.kernel + offset, 0, size) == NULL; } #endif /* _LINUX_SOCKPTR_H */ |
| 10 10 9 9 9 3 10 10 10 10 10 187 187 18 181 7 175 1 1 181 180 12 6 181 181 181 9 2 4 181 9 9 180 10 181 181 12 10 10 10 8 3 9 9 181 10 10 2 8 2 8 8 12 2 10 8 25 25 25 25 25 25 25 25 25 25 25 22 3 20 10 14 12 12 12 12 99 15 14 2 14 14 14 14 14 5 1 8 8 181 181 181 181 9 181 181 181 15 181 181 184 180 184 184 1 180 91 180 181 181 181 180 181 181 181 181 180 181 181 181 181 181 181 180 181 180 181 181 181 181 181 181 177 181 180 9 10 11 11 1 3 10 1 11 10 3 3 11 11 10 3 11 11 10 10 10 10 9 4 4 1 4 10 10 10 1 73 291 304 15 180 180 180 179 180 5 180 180 179 180 180 158 22 70 110 180 10 181 1 181 181 181 181 181 1 181 181 8 2 8 8 181 181 181 1 181 97 15 90 90 271 28 10 259 28 97 29 89 10 10 10 40 30 297 46 299 54 222 304 221 304 304 304 304 222 222 304 304 3 3 297 222 304 3 3 3 3 3 2 3 3 3 3 3 3 303 303 5 304 304 302 304 304 304 304 304 300 4 1 64 303 303 3 4 1 303 303 304 302 222 303 304 302 304 304 222 222 304 3 3 304 304 304 291 73 9 9 181 181 14 14 14 181 181 181 181 25 24 1 290 1 77 27 261 290 291 1 291 261 2 3 2 3 3 3 1 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/sort.h> #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> #include "ctree.h" #include "extent-tree.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" #include "discard.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" #include "tree-checker.h" #include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static int block_group_bits(struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); int ret; struct btrfs_key key; struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); btrfs_free_path(path); return ret; } /* * helper function to lookup reference count and flags of a tree block. * * the head node for delayed ref is used to store the sum of all the * reference count modifications queued up in the rbtree. the head * node may also store the extent flags to set. This way you can check * to see what the reference count and extent flags would be if all of * the delayed refs are not processed. */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags, u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; struct btrfs_key key; u64 num_refs; u64 extent_flags; u64 owner = 0; int ret; /* * If we don't have skinny metadata, don't bother doing anything * different */ if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { offset = fs_info->nodesize; metadata = 0; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; search_again: key.objectid = bytenr; key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == fs_info->nodesize) ret = 0; } } if (ret == 0) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; const u32 item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out_free; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); if (unlikely(num_refs == 0)) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); goto out_free; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); } else { num_refs = 0; extent_flags = 0; ret = 0; } delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); /* * Mutex was contended, block until it's released and try * again */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); goto search_again; } spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); WARN_ON(num_refs == 0); if (refs) *refs = num_refs; if (flags) *flags = extent_flags; if (owning_root) *owning_root = owner; out_free: btrfs_free_path(path); return ret; } /* * Back reference rules. Back refs have three main goals: * * 1) differentiate between all holders of references to an extent so that * when a reference is dropped we can make sure it was a valid reference * before freeing the extent. * * 2) Provide enough information to quickly find the holders of an extent * if we notice a given block is corrupted or bad. * * 3) Make it easy to migrate blocks for FS shrinking or storage pool * maintenance. This is actually the same as #2, but with a slightly * different use case. * * There are two kinds of back refs. The implicit back refs is optimized * for pointers in non-shared tree blocks. For a given pointer in a block, * back refs of this kind provide information about the block's owner tree * and the pointer's key. These information allow us to find the block by * b-tree searching. The full back refs is for pointers in tree blocks not * referenced by their owner trees. The location of tree block is recorded * in the back refs. Actually the full back refs is generic, and can be * used in all cases the implicit back refs is used. The major shortcoming * of the full back refs is its overhead. Every time a tree block gets * COWed, we have to update back refs entry for all pointers in it. * * For a newly allocated tree block, we use implicit back refs for * pointers in it. This means most tree related operations only involve * implicit back refs. For a tree block created in old transaction, the * only way to drop a reference to it is COW it. So we can detect the * event that tree block loses its owner tree's reference and do the * back refs conversion. * * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. * * The reference count of the block is one and the tree is not the * block's owner tree. In this case, full back refs is used for pointers * in the block. Remove these full back refs, add implicit back refs for * every pointers in the new block. * * The reference count of the block is greater than one and the tree is * the block's owner tree. In this case, implicit back refs is used for * pointers in the block. Add full back refs for every pointers in the * block, increase lower level extents' reference counts. The original * implicit back refs are entailed to the new block. * * The reference count of the block is greater than one and the tree is * not the block's owner tree. Add implicit back refs for every pointer in * the new block, increase lower level extents' reference count. * * Back Reference Key composing: * * The key objectid corresponds to the first byte in the extent, * The key type is used to differentiate between types of back refs. * There are different meanings of the key offset for different types * of back refs. * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * * The extent ref structure for the implicit back refs has fields for: * * - Objectid of the subvolume root * - objectid of the file holding the reference * - original offset in the file * - how many bookend extents * * The key offset for the implicit back refs is hash of the first * three fields. * * The extent ref structure for the full back refs has field for: * * - number of pointers in the tree leaf * * The key offset for the implicit back refs is the first byte of * the tree leaf * * When a file extent is allocated, The implicit back refs is used. * the fields are filled in: * * (root_key.objectid, inode objectid, offset in file, 1) * * When a file extent is removed file truncation, we find the * corresponding implicit back refs and check the following fields: * * (btrfs_header_owner(leaf), inode objectid, offset in file) * * Btree extents can be referenced by: * * - Different subvolumes * * Both the implicit back refs and the full back refs for tree blocks * only consist of key. The key offset for the implicit back refs is * objectid of block's owner tree. The key offset for the full back refs * is the first byte of parent block. * * When implicit back refs is used, information about the lowest key and * level of the tree block are required. These information are stored in * tree block info structure. */ /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); if (type == BTRFS_EXTENT_OWNER_REF_KEY) { ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); return type; } if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || type == BTRFS_EXTENT_DATA_REF_KEY) { if (is_data == BTRFS_REF_TYPE_BLOCK) { if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { ASSERT(is_data == BTRFS_REF_TYPE_ANY); return type; } } WARN_ON(1); btrfs_print_leaf(eb); btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); return BTRFS_REF_TYPE_INVALID; } u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) { u32 high_crc = ~(u32)0; u32 low_crc = ~(u32)0; __le64 lenum; lenum = cpu_to_le64(root_objectid); high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, struct btrfs_extent_data_ref *ref) { return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), btrfs_extent_data_ref_objectid(leaf, ref), btrfs_extent_data_ref_offset(leaf, ref)); } static int match_extent_data_ref(struct extent_buffer *leaf, struct btrfs_extent_data_ref *ref, u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) return 0; return 1; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; u32 nritems; int recow; int ret; key.objectid = bytenr; if (parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; key.offset = parent; } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; key.offset = hash_extent_data_ref(root_objectid, owner, offset); } again: recow = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) return ret; if (parent) { if (ret) return -ENOENT; return 0; } ret = -ENOENT; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); if (ret) { if (ret > 0) return -ENOENT; return ret; } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); recow = 1; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_DATA_REF_KEY) goto fail; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (match_extent_data_ref(leaf, ref, root_objectid, owner, offset)) { if (recow) { btrfs_release_path(path); goto again; } ret = 0; break; } path->slots[0]++; } fail: return ret; } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; u64 owner = btrfs_delayed_ref_owner(node); u64 offset = btrfs_delayed_ref_offset(node); u32 size; u32 num_refs; int ret; key.objectid = bytenr; if (node->parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; key.offset = node->parent; size = sizeof(struct btrfs_shared_data_ref); } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; key.offset = hash_extent_data_ref(node->ref_root, owner, offset); size = sizeof(struct btrfs_extent_data_ref); } ret = btrfs_insert_empty_item(trans, root, path, &key, size); if (ret && ret != -EEXIST) goto fail; leaf = path->nodes[0]; if (node->parent) { struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); if (ret == 0) { btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_shared_data_ref_count(leaf, ref); num_refs += node->ref_mod; btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } } else { struct btrfs_extent_data_ref *ref; while (ret == -EEXIST) { ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (match_extent_data_ref(leaf, ref, node->ref_root, owner, offset)) break; btrfs_release_path(path); key.offset++; ret = btrfs_insert_empty_item(trans, root, path, &key, size); if (ret && ret != -EEXIST) goto fail; leaf = path->nodes[0]; } ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (ret == 0) { btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_extent_data_ref_count(leaf, ref); num_refs += node->ref_mod; btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } ret = 0; fail: btrfs_release_path(path); return ret; } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; struct btrfs_shared_data_ref *ref2 = NULL; struct extent_buffer *leaf; u32 num_refs = 0; int ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } else { btrfs_err(trans->fs_info, "unrecognized backref key (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, -EUCLEAN); return -EUCLEAN; } BUG_ON(num_refs < refs_to_drop); num_refs -= refs_to_drop; if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); } return ret; } static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref1; struct btrfs_shared_data_ref *ref2; u32 num_refs = 0; int type; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (iref) { /* * If type is invalid, we should have bailed out earlier than * this call. */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); ASSERT(type != BTRFS_REF_TYPE_INVALID); if (type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else { ref2 = (struct btrfs_shared_data_ref *)(iref + 1); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } else { WARN_ON(1); } return num_refs; } static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; if (parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; key.offset = parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; key.offset = root_objectid; } ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) ret = -ENOENT; return ret; } static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; if (node->parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; key.offset = node->parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; key.offset = node->ref_root; } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(path); return ret; } static inline int extent_ref_type(u64 parent, u64 owner) { int type; if (owner < BTRFS_FIRST_FREE_OBJECTID) { if (parent > 0) type = BTRFS_SHARED_BLOCK_REF_KEY; else type = BTRFS_TREE_BLOCK_REF_KEY; } else { if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; } return type; } static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) { for (; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) break; if (path->slots[level] + 1 >= btrfs_header_nritems(path->nodes[level])) continue; if (level == 0) btrfs_item_key_to_cpu(path->nodes[level], key, path->slots[level] + 1); else btrfs_node_key_to_cpu(path->nodes[level], key, path->slots[level] + 1); return 0; } return 1; } /* * look for inline back ref. if back ref is found, *ref_ret is set * to the address of inline back ref, and 0 is returned. * * if back ref isn't found, *ref_ret is set to the address where it * should be inserted, and -ENOENT is returned. * * if insert is true and there are too many inline back refs, the path * points to the extent item, and -EAGAIN is returned. * * NOTE: inline back refs are ordered in the same way that back ref * items in the tree are ordered. */ static noinline_for_stack int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int insert) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; u64 flags; u64 item_size; unsigned long ptr; unsigned long end; int extra_size; int type; int want; int ret; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); int needed; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; } else extra_size = -1; /* * Owner is our level, so we can just add one to get the level for the * block we are interested in. */ if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = owner; } again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); if (ret < 0) goto out; /* * We may be a newly converted file system which still has the old fat * extent entries for metadata, so try and see if we have one of those. */ if (ret > 0 && skinny_metadata) { skinny_metadata = false; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes) ret = 0; } if (ret) { key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); goto again; } } if (ret && !insert) { ret = -ENOENT; goto out; } else if (WARN_ON(ret)) { btrfs_print_leaf(path->nodes[0]); btrfs_err(fs_info, "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", bytenr, num_bytes, parent, root_objectid, owner, offset); ret = -EUCLEAN; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %llu expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); } if (owner >= BTRFS_FIRST_FREE_OBJECTID) needed = BTRFS_REF_TYPE_DATA; else needed = BTRFS_REF_TYPE_BLOCK; ret = -ENOENT; while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); if (type == BTRFS_EXTENT_OWNER_REF_KEY) { ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); ptr += btrfs_extent_inline_ref_size(type); continue; } if (type == BTRFS_REF_TYPE_INVALID) { ret = -EUCLEAN; goto out; } if (want < type) break; if (want > type) { ptr += btrfs_extent_inline_ref_size(type); continue; } if (type == BTRFS_EXTENT_DATA_REF_KEY) { struct btrfs_extent_data_ref *dref; dref = (struct btrfs_extent_data_ref *)(&iref->offset); if (match_extent_data_ref(leaf, dref, root_objectid, owner, offset)) { ret = 0; break; } if (hash_extent_data_ref_item(leaf, dref) < hash_extent_data_ref(root_objectid, owner, offset)) break; } else { u64 ref_offset; ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); if (parent > 0) { if (parent == ref_offset) { ret = 0; break; } if (ref_offset < parent) break; } else { if (root_objectid == ref_offset) { ret = 0; break; } if (ref_offset < root_objectid) break; } } ptr += btrfs_extent_inline_ref_size(type); } if (unlikely(ptr > end)) { ret = -EUCLEAN; btrfs_print_leaf(path->nodes[0]); btrfs_crit(fs_info, "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", path->slots[0], root_objectid, owner, offset, parent); goto out; } if (ret == -ENOENT && insert) { if (item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { ret = -EAGAIN; goto out; } if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { struct btrfs_key tmp_key; btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); if (tmp_key.objectid == bytenr && tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { ret = -EAGAIN; goto out; } goto out_no_entry; } if (!path->keep_locks) { btrfs_release_path(path); path->keep_locks = 1; goto again; } /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. * For simplicity, we just do not add new inline back * ref if there is any kind of item for this block */ if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { ret = -EAGAIN; goto out; } } out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: if (path->keep_locks) { path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); } if (insert) path->search_for_extension = 0; return ret; } /* * helper to add new inline back ref */ static noinline_for_stack void setup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; unsigned long ptr; unsigned long end; unsigned long item_offset; u64 refs; int size; int type; leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); item_offset = (unsigned long)iref - (unsigned long)ei; type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); btrfs_extend_item(trans, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); refs += refs_to_add; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); ptr = (unsigned long)ei + item_offset; end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); if (ptr < end - size) memmove_extent_buffer(leaf, ptr + size, ptr, end - size - ptr); iref = (struct btrfs_extent_inline_ref *)ptr; btrfs_set_extent_inline_ref_type(leaf, iref, type); if (type == BTRFS_EXTENT_DATA_REF_KEY) { struct btrfs_extent_data_ref *dref; dref = (struct btrfs_extent_data_ref *)(&iref->offset); btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); btrfs_set_extent_data_ref_objectid(leaf, dref, owner); btrfs_set_extent_data_ref_offset(leaf, dref, offset); btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); } else if (type == BTRFS_SHARED_DATA_REF_KEY) { struct btrfs_shared_data_ref *sref; sref = (struct btrfs_shared_data_ref *)(iref + 1); btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); btrfs_set_extent_inline_ref_offset(leaf, iref, parent); } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_offset(leaf, iref, parent); } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } } static int lookup_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { int ret; ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, num_bytes, parent, root_objectid, owner, offset, 0); if (ret != -ENOENT) return ret; btrfs_release_path(path); *ref_ret = NULL; if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = lookup_tree_block_ref(trans, path, bytenr, parent, root_objectid); } else { ret = lookup_extent_data_ref(trans, path, bytenr, parent, root_objectid, owner, offset); } return ret; } /* * helper to update/remove inline back ref */ static noinline_for_stack int update_inline_extent_backref( struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; unsigned long ptr; unsigned long end; u32 item_size; int size; int type; u64 refs; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { struct btrfs_key key; u32 extent_size; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type == BTRFS_METADATA_ITEM_KEY) extent_size = fs_info->nodesize; else extent_size = key.offset; btrfs_print_leaf(leaf); btrfs_err(fs_info, "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", key.objectid, extent_size, refs_to_mod, refs); return -EUCLEAN; } refs += refs_to_mod; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* * Function btrfs_get_extent_inline_ref_type() has already printed * error messages. */ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); refs = btrfs_extent_data_ref_count(leaf, dref); } else if (type == BTRFS_SHARED_DATA_REF_KEY) { sref = (struct btrfs_shared_data_ref *)(iref + 1); refs = btrfs_shared_data_ref_count(leaf, sref); } else { refs = 1; /* * For tree blocks we can only drop one ref for it, and tree * blocks should not have refs > 1. * * Furthermore if we're inserting a new inline backref, we * won't reach this path either. That would be * setup_inline_extent_backref(). */ if (unlikely(refs_to_mod != -1)) { struct btrfs_key key; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_print_leaf(leaf); btrfs_err(fs_info, "invalid refs_to_mod for tree block %llu, has %d expect -1", key.objectid, refs_to_mod); return -EUCLEAN; } } if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { struct btrfs_key key; u32 extent_size; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type == BTRFS_METADATA_ITEM_KEY) extent_size = fs_info->nodesize; else extent_size = key.offset; btrfs_print_leaf(leaf); btrfs_err(fs_info, "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", (unsigned long)iref, key.objectid, extent_size, refs_to_mod, refs); return -EUCLEAN; } refs += refs_to_mod; if (refs > 0) { if (type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, dref, refs); else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; end = (unsigned long)ei + item_size; if (ptr + size < end) memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } return 0; } static noinline_for_stack int insert_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_extent_inline_ref *iref; int ret; ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { /* * We're adding refs to a tree block we already own, this * should not happen at all. */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } ret = update_inline_extent_backref(trans, path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; } return ret; } static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) ret = update_inline_extent_backref(trans, path, iref, -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else ret = btrfs_del_item(trans, root, path); return ret; } static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 *discarded_bytes) { int j, ret = 0; u64 bytes_left, end; u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, SECTOR_SIZE); start = aligned_start; } *discarded_bytes = 0; if (!len) return 0; end = start + len; bytes_left = len; /* Skip any superblocks on this device. */ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { u64 sb_start = btrfs_sb_offset(j); u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; u64 size = sb_start - start; if (!in_range(sb_start, start, bytes_left) && !in_range(sb_end, start, bytes_left) && !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) continue; /* * Superblock spans beginning of range. Adjust start and * try again. */ if (sb_start <= start) { start += sb_end - start; if (start > end) { bytes_left = 0; break; } bytes_left = end - start; continue; } if (size) { ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, size >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) return ret; } start = sb_end; if (start > end) { bytes_left = 0; break; } bytes_left = end - start; } while (bytes_left) { u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left); ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, bytes_to_discard >> SECTOR_SHIFT, GFP_NOFS); if (ret) { if (ret != -EOPNOTSUPP) break; continue; } start += bytes_to_discard; bytes_left -= bytes_to_discard; *discarded_bytes += bytes_to_discard; if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } } return ret; } static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; u64 phys = stripe->physical; u64 len = stripe->length; u64 discarded = 0; int ret = 0; /* Zone reset on a zoned filesystem */ if (btrfs_can_zone_reset(dev, phys, len)) { u64 src_disc; ret = btrfs_reset_device_zone(dev, phys, len, &discarded); if (ret) goto out; if (!btrfs_dev_replace_is_ongoing(dev_replace) || dev != dev_replace->srcdev) goto out; src_disc = discarded; /* Send to replace target as well */ ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; *bytes = 0; } out: *bytes = discarded; return ret; } int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { int ret = 0; u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; /* * Avoid races with device replace and make sure the devices in the * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { struct btrfs_discard_stripe *stripes; unsigned int num_stripes; int i; num_bytes = end - cur; stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); if (IS_ERR(stripes)) { ret = PTR_ERR(stripes); if (ret == -EOPNOTSUPP) ret = 0; break; } for (i = 0; i < num_stripes; i++) { struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &stripe->dev->dev_state)) continue; ret = do_discard_extent(stripe, &bytes); if (ret) { /* * Keep going if discard is not supported by the * device. */ if (ret != -EOPNOTSUPP) break; ret = 0; } else { discarded_bytes += bytes; } } kfree(stripes); if (ret) break; cur += num_bytes; } btrfs_bio_counter_dec(fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; return ret; } /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); else ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); btrfs_ref_tree_mod(fs_info, generic_ref); return ret; } /* * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; u64 owner = btrfs_delayed_ref_owner(node); u64 offset = btrfs_delayed_ref_offset(node); u64 refs; int refs_to_add = node->ref_mod; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; /* * Ok we had -EAGAIN which means we didn't have space to insert and * inline extent ref, so just update the reference count and add a * normal backref. */ leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); btrfs_release_path(path); /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) ret = insert_tree_block_ref(trans, path, node, bytenr); else ret = insert_extent_data_ref(trans, path, node, bytenr); if (ret) btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; } static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *href) { u64 root = href->owning_root; /* * Don't check must_insert_reserved, as this is called from contexts * where it has already been unset. */ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || !href->is_data || !is_fstree(root)) return; btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, BTRFS_QGROUP_RSV_DATA); } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; u64 parent = 0; u64 flags = 0; trace_run_delayed_data_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = node->parent; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { struct btrfs_key key; struct btrfs_squota_delta delta = { .root = href->owning_root, .num_bytes = node->num_bytes, .is_data = true, .is_inc = true, .generation = trans->transid, }; u64 owner = btrfs_delayed_ref_owner(node); u64 offset = btrfs_delayed_ref_offset(node); if (extent_op) flags |= extent_op->flags_to_set; key.objectid = node->bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; ret = alloc_reserved_file_extent(trans, parent, node->ref_root, flags, owner, offset, &key, node->ref_mod, href->owning_root); free_head_ref_squota_rsv(trans->fs_info, href); if (!ret) ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } return ret; } static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei) { u64 flags = btrfs_extent_flags(leaf, ei); if (extent_op->update_flags) { flags |= extent_op->flags_to_set; btrfs_set_extent_flags(leaf, ei, flags); } if (extent_op->update_key) { struct btrfs_tree_block_info *bi; BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); bi = (struct btrfs_tree_block_info *)(ei + 1); btrfs_set_tree_block_key(leaf, bi, &extent_op->key); } } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; int ret; int metadata = 1; if (TRANS_ABORTED(trans)) return 0; if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = head->bytenr; if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = head->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = head->num_bytes; } root = btrfs_extent_root(fs_info, key.objectid); again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { goto out; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == head->bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == head->num_bytes) ret = 0; } if (ret > 0) { btrfs_release_path(path); metadata = 0; key.objectid = head->bytenr; key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; goto again; } } else { ret = -EUCLEAN; btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); goto out; } } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); out: btrfs_free_path(path); return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; struct btrfs_fs_info *fs_info = trans->fs_info; u64 parent = 0; u64 ref_root = 0; trace_run_delayed_tree_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = node->parent; ref_root = node->ref_root; if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { struct btrfs_squota_delta delta = { .root = href->owning_root, .num_bytes = fs_info->nodesize, .is_data = false, .is_inc = true, .generation = trans->transid, }; ret = alloc_reserved_tree_block(trans, node, extent_op); if (!ret) btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } return ret; } /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; if (TRANS_ABORTED(trans)) { if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); free_head_ref_squota_rsv(trans->fs_info, href); } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) ret = 0; else BUG(); if (ret && insert_reserved) btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); if (ret < 0) btrfs_err(trans->fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", node->bytenr, node->num_bytes, node->type, node->action, node->ref_mod, ret); return ret; } static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; if (!extent_op) return NULL; if (head->must_insert_reserved) { head->extent_op = NULL; btrfs_free_delayed_extent_op(extent_op); return NULL; } return extent_op; } static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op; int ret; extent_op = cleanup_extent_op(head); if (!extent_op) return 0; head->extent_op = NULL; spin_unlock(&head->lock); ret = run_delayed_extent_op(trans, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { u64 ret = 0; /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. */ if (head->total_ref_mod < 0 && head->is_data) { int nr_csums; spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } /* must_insert_reserved can be set only if we didn't run the head ref. */ if (head->must_insert_reserved) free_head_ref_squota_rsv(fs_info, head); return ret; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; int ret; delayed_refs = &trans->transaction->delayed_refs; ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { return ret; } /* * Need to drop our head ref lock and re-acquire the delayed ref lock * and then re-check to make sure nobody got added. */ spin_unlock(&head->lock); spin_lock(&delayed_refs->lock); spin_lock(&head->lock); if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 1; } btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(fs_info, head->bytenr); ret = btrfs_del_csums(trans, csum_root, head->bytenr, head->num_bytes); } } *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); return ret; } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_extent_op *extent_op; struct btrfs_delayed_ref_node *ref; bool must_insert_reserved; int ret; delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); /* * When we play the delayed ref, also correct the ref_mod on * head */ switch (ref->action) { case BTRFS_ADD_DELAYED_REF: case BTRFS_ADD_DELAYED_EXTENT: locked_ref->ref_mod -= ref->ref_mod; break; case BTRFS_DROP_DELAYED_REF: locked_ref->ref_mod += ref->ref_mod; break; default: WARN_ON(1); } /* * Record the must_insert_reserved flag before we drop the * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; /* * Unsetting this on the head ref relinquishes ownership of * the rsv_bytes, so it is critical that every possible code * path from here forward frees all reserves including qgroup * reserve. */ locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, must_insert_reserved); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); btrfs_free_delayed_extent_op(extent_op); if (ret) { btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } btrfs_put_delayed_ref(ref); cond_resched(); spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; unsigned long count = 0; unsigned long max_count = 0; u64 bytes_processed = 0; delayed_refs = &trans->transaction->delayed_refs; if (min_bytes == 0) { max_count = delayed_refs->num_heads_ready; min_bytes = U64_MAX; } do { if (!locked_ref) { locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; } else { break; } } count++; } /* * We need to try and merge add/drops of the same ref since we * can run into issues with relocate dropping the implicit ref * and then it being added back again before the drop can * finish. If we merged anything we need to re-loop so we can * get a good ref. * Or we can get node references of the same type that weren't * merged when created due to bumps in the tree mod seq, and * we need to merge them to prevent adding an inline extent * backref before dropping it (triggering a BUG_ON at * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already * unlocked everything so just bail out */ return ret; } else if (!ret) { /* * Success, perform the usual cleanup of a processed * head */ ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; continue; } else if (ret) { return ret; } } /* * Either success case or btrfs_run_delayed_refs_for_head * returned -EAGAIN, meaning we need to select another head */ locked_ref = NULL; cond_resched(); } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || (max_count > 0 && count < max_count) || locked_ref); return 0; } #ifdef SCRAMBLE_DELAYED_REFS /* * Normally delayed refs get processed in ascending bytenr order. This * correlates in most cases to the order added. To expose dependencies on this * order, we start to process the tree in the middle instead of the beginning */ static u64 find_middle(struct rb_root *root) { struct rb_node *n = root->rb_node; struct btrfs_delayed_ref_node *entry; int alt = 1; u64 middle; u64 first = 0, last = 0; n = rb_first(root); if (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); first = entry->bytenr; } n = rb_last(root); if (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); last = entry->bytenr; } n = root->rb_node; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); middle = entry->bytenr; if (alt) n = n->rb_left; else n = n->rb_right; alt = 1 - alt; } return middle; } #endif /* * Start processing the delayed reference count updates and extent insertions * we have queued up so far. * * @trans: Transaction handle. * @min_bytes: How many bytes of delayed references to process. After this * many bytes we stop processing delayed references if there are * any more. If 0 it means to run all existing delayed references, * but not new ones added after running all existing ones. * Use (u64)-1 (U64_MAX) to run all existing delayed references * plus any new ones that are added. * * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; int ret; /* We'll clean this up in btrfs_cleanup_transaction */ if (TRANS_ABORTED(trans)) return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) return 0; delayed_refs = &trans->transaction->delayed_refs; again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif ret = __btrfs_run_delayed_refs(trans, min_bytes); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; } if (min_bytes == U64_MAX) { btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); return 0; } spin_unlock(&delayed_refs->lock); cond_resched(); goto again; } return 0; } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; int ret; extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, btrfs_header_level(eb), extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; } static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 bytenr) { struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; struct rb_node *node; int ret = 0; spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; if (cur_trans) refcount_inc(&cur_trans->use_count); spin_unlock(&root->fs_info->trans_lock); if (!cur_trans) return 0; delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); return 0; } if (!mutex_trylock(&head->mutex)) { if (path->nowait) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); return -EAGAIN; } refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); /* * Mutex was contended, block until it's released and let * caller try again */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); btrfs_put_transaction(cur_trans); return -EAGAIN; } spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); /* * XXX: We should replace this with a proper search function in the * future. */ for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { u64 ref_owner; u64 ref_offset; ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; break; } ref_owner = btrfs_delayed_ref_owner(ref); ref_offset = btrfs_delayed_ref_offset(ref); /* * If our ref doesn't match the one we're currently looking at * then we have a cross reference. */ if (ref->ref_root != btrfs_root_id(root) || ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } } spin_unlock(&head->lock); mutex_unlock(&head->mutex); btrfs_put_transaction(cur_trans); return ret; } /* * Check if there are references for a data extent other than the one belonging * to the given inode and offset. * * @inode: The only inode we expect to find associated with the data extent. * @path: A path to use for searching the extent tree. * @offset: The only offset we expect to find associated with the data extent. * @bytenr: The logical address of the data extent. * * When the extent does not have any other references other than the one we * expect to find, we always return a value of 0 with the path having a locked * leaf that contains the extent's extent item - this is necessary to ensure * we don't race with a task running delayed references, and our caller must * have such a path when calling check_delayed_ref() - it must lock a delayed * ref head while holding the leaf locked. In case the extent item is not found * in the extent tree, we return -ENOENT with the path having the leaf (locked) * where the extent item should be, in order to prevent races with another task * running delayed references, so that we don't miss any reference when calling * check_delayed_ref(). * * Note: this may return false positives, and this is because we want to be * quick here as we're called in write paths (when flushing delalloc and * in the direct IO write path). For example we can have an extent with * a single reference but that reference is not inlined, or we may have * many references in the extent tree but we also have delayed references * that cancel all the reference except the one for our inode and offset, * but it would be expensive to do such checks and complex due to all * locking to avoid races between the checks and flushing delayed refs, * plus non-inline references may be located on leaves other than the one * that contains the extent item in the extent tree. The important thing * here is to not return false negatives and that the false positives are * not very common. * * Returns: 0 if there are no cross references and with the path having a locked * leaf from the extent tree that contains the extent's extent item. * * 1 if there are cross references (false positives can happen). * * < 0 in case of an error. In case of -ENOENT the leaf in the extent * tree where the extent item should be located at is read locked and * accessible in the given path. */ static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 bytenr) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; u32 expected_size; int type; int ret; key.objectid = bytenr; key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ return -EUCLEAN; } if (path->slots[0] == 0) return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) return -ENOENT; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) return 1; /* Check for an owner ref; skip over it to the real inline refs. */ iref = (struct btrfs_extent_inline_ref *)(ei + 1); type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) return 1; /* If this extent has SHARED_DATA_REF then it's shared */ if (type != BTRFS_EXTENT_DATA_REF_KEY) return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) return 1; return 0; } int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path) { int ret; do { ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; /* * The path must have a locked leaf from the extent tree where * the extent item for our extent is located, in case it exists, * or where it should be located in case it doesn't exist yet * because it's new and its delayed ref was not yet flushed. * We need to lock the delayed ref head at check_delayed_ref(), * if one exists, while holding the leaf locked in order to not * race with delayed ref flushing, missing references and * incorrectly reporting that the extent is not shared. */ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { struct extent_buffer *leaf = path->nodes[0]; ASSERT(leaf != NULL); btrfs_assert_tree_read_locked(leaf); if (ret != -ENOENT) { struct btrfs_key key; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ASSERT(key.objectid == bytenr); ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); } } ret = check_delayed_ref(inode, path, offset, bytenr); } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int inc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 parent; u64 ref_root; u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; int action; int level; int ret = 0; if (btrfs_is_testing(fs_info)) return 0; ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) return 0; if (full_backref) parent = buf->start; else parent = 0; if (inc) action = BTRFS_ADD_DELAYED_REF; else action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { struct btrfs_ref ref = { .action = action, .parent = parent, .ref_root = ref_root, }; if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); if (ref.bytenr == 0) continue; ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); ref.owning_root = ref_root; key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &ref); else ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } else { /* We don't know the owning_root, leave as 0. */ ref.bytenr = btrfs_node_blockptr(buf, i); ref.num_bytes = fs_info->nodesize; btrfs_init_tree_ref(&ref, level - 1, btrfs_root_id(root), for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &ref); else ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } } return 0; fail: return ret; } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref) { return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref) { return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; else if (root == fs_info->chunk_root) flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; ret = btrfs_get_alloc_profile(fs_info, flags); return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { struct rb_node *leftmost; u64 bytenr = 0; read_lock(&fs_info->block_group_cache_lock); /* Get the block group with the lowest logical start address. */ leftmost = rb_first_cached(&fs_info->block_group_cache_tree); if (leftmost) { struct btrfs_block_group *bg; bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); bytenr = bg->start; } read_unlock(&fs_info->block_group_cache_lock); return bytenr; } static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ pin_down_extent(trans, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; } int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret; cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) return -EINVAL; /* * Fully cache the free space first so that our pin removes the free space * from the cache. */ ret = btrfs_cache_block_group(cache, true); if (ret) goto out; pin_down_extent(trans, cache, eb->start, eb->len, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, eb->start, eb->len); out: btrfs_put_block_group(cache); return ret; } static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { int ret; struct btrfs_block_group *block_group; block_group = btrfs_lookup_block_group(fs_info, start); if (!block_group) return -EINVAL; ret = btrfs_cache_block_group(block_group, true); if (ret) goto out; ret = btrfs_remove_free_space(block_group, start, num_bytes); out: btrfs_put_block_group(block_group); return ret; } int btrfs_exclude_logged_extents(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_file_extent_item *item; struct btrfs_key key; int found_type; int i; int ret = 0; if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) return 0; for (i = 0; i < btrfs_header_nritems(eb); i++) { btrfs_item_key_to_cpu(eb, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(eb, item); if (found_type == BTRFS_FILE_EXTENT_INLINE) continue; if (btrfs_file_extent_disk_bytenr(eb, item) == 0) continue; key.objectid = btrfs_file_extent_disk_bytenr(eb, item); key.offset = btrfs_file_extent_disk_num_bytes(eb, item); ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); if (ret) break; } return ret; } static void btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) { atomic_inc(&bg->reservations); } /* * Returns the free cluster for the given space info and sets empty_cluster to * what it should be based on the mount options. */ static struct btrfs_free_cluster * fetch_cluster_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) return ret; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &fs_info->meta_alloc_cluster; if (btrfs_test_opt(fs_info, SSD)) *empty_cluster = SZ_2M; else *empty_cluster = SZ_64K; } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(fs_info, SSD_SPREAD)) { *empty_cluster = SZ_2M; ret = &fs_info->data_alloc_cluster; } return ret; } static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end, const bool return_free_space) { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_free_cluster *cluster = NULL; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { u64 len; readonly = false; if (!cache || start >= cache->start + cache->length) { if (cache) btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); if (cache == NULL) { /* Logic error, something removed the block group. */ ret = -EUCLEAN; goto out; } cluster = fetch_cluster_info(fs_info, cache->space_info, &empty_cluster); empty_cluster <<= 1; } len = cache->start + cache->length - start; len = min(len, end + 1 - start); if (return_free_space) btrfs_add_free_space(cache, start, len); start += len; total_unpinned += len; space_info = cache->space_info; /* * If this space cluster has been marked as fragmented and we've * unpinned enough in this block group to potentially allow a * cluster to be created inside of it go ahead and clear the * fragmented check. */ if (cluster && cluster->fragmented && total_unpinned > empty_cluster) { spin_lock(&cluster->lock); cluster->fragmented = 0; spin_unlock(&cluster->lock); } spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); if (!readonly && return_free_space) btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } if (cache) btrfs_put_block_group(cache); out: return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; u64 end; int ret; unpin = &trans->transaction->pinned_extents; while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); if (!find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); ret = unpin_extent_range(fs_info, start, end, true); BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); } if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); btrfs_discard_schedule_work(&fs_info->discard_ctl, true); } /* * Transaction is finished. We don't need the lock anymore. We * do need to clean up the block groups in case of a transaction * abort. */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { u64 trimmed = 0; ret = -EROFS; if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, block_group->length, &trimmed); list_del_init(&block_group->bg_list); btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); if (ret) { const char *errstr = btrfs_decode_error(ret); btrfs_warn(fs_info, "discard failed while removing blockgroup: errno=%d %s", ret, errstr); } } return 0; } /* * Parse an extent item's inline extents looking for a simple quotas owner ref. * * @fs_info: the btrfs_fs_info for this mount * @leaf: a leaf in the extent tree containing the extent item * @slot: the slot in the leaf where the extent item is found * * Returns the objectid of the root that originally allocated the extent item * if the inline owner ref is expected and present, otherwise 0. * * If an extent item has an owner ref item, it will be the first inline ref * item. Therefore the logic is to check whether there are any inline ref * items, then check the type of the first one. */ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_owner_ref *oref; unsigned long ptr; unsigned long end; int type; if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) return 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + btrfs_item_size(leaf, slot); /* No inline ref items of any kind, can't check type. */ if (ptr == end) return 0; iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* We found an owner ref, get the root out of it. */ if (type == BTRFS_EXTENT_OWNER_REF_KEY) { oref = (struct btrfs_extent_owner_ref *)(&iref->offset); return btrfs_extent_owner_ref_root_id(leaf, oref); } /* We have inline refs, but not an owner ref. */ return 0; } static int do_free_extent_accounting(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_squota_delta *delta) { int ret; u64 num_bytes = delta->num_bytes; if (delta->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(trans->fs_info, bytenr); ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } } ret = btrfs_record_squota_delta(trans->fs_info, delta); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = add_to_free_space_tree(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); if (ret) btrfs_abort_transaction(trans, ret); return ret; } #define abort_and_dump(trans, path, fmt, args...) \ ({ \ btrfs_abort_transaction(trans, -EUCLEAN); \ btrfs_print_leaf(path->nodes[0]); \ btrfs_crit(trans->fs_info, fmt, ##args); \ }) /* * Drop one or more refs of @node. * * 1. Locate the extent refs. * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. * Locate it, then reduce the refs number or remove the ref line completely. * * 2. Update the refs count in EXTENT/METADATA_ITEM * * Inline backref case: * * in extent tree we have: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 * refs 2 gen 6 flags DATA * extent data backref root FS_TREE objectid 258 offset 0 count 1 * extent data backref root FS_TREE objectid 257 offset 0 count 1 * * This function gets called with: * * node->bytenr = 13631488 * node->num_bytes = 1048576 * root_objectid = FS_TREE * owner_objectid = 257 * owner_offset = 0 * refs_to_drop = 1 * * Then we should get some like: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 * refs 1 gen 6 flags DATA * extent data backref root FS_TREE objectid 258 offset 0 count 1 * * Keyed backref case: * * in extent tree we have: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 * refs 754 gen 6 flags DATA * [...] * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 * extent data backref root FS_TREE objectid 866 offset 0 count 1 * * This function get called with: * * node->bytenr = 13631488 * node->num_bytes = 1048576 * root_objectid = FS_TREE * owner_objectid = 866 * owner_offset = 0 * refs_to_drop = 1 * * Then we should get some like: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 * refs 753 gen 6 flags DATA * * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; int ret; int is_data; int extent_slot = 0; int found_extent = 0; int num_to_del = 1; int refs_to_drop = node->ref_mod; u32 item_size; u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; u64 owner_objectid = btrfs_delayed_ref_owner(node); u64 owner_offset = btrfs_delayed_ref_offset(node); bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); u64 delayed_ref_root = href->owning_root; extent_root = btrfs_extent_root(info, bytenr); ASSERT(extent_root); path = btrfs_alloc_path(); if (!path) return -ENOMEM; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; if (!is_data && refs_to_drop != 1) { btrfs_crit(info, "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", node->bytenr, refs_to_drop); ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } if (is_data) skinny_metadata = false; ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, node->parent, node->ref_root, owner_objectid, owner_offset); if (ret == 0) { /* * Either the inline backref or the SHARED_DATA_REF/ * SHARED_BLOCK_REF is found * * Here is a quick path to locate EXTENT/METADATA_ITEM. * It's possible the EXTENT/METADATA_ITEM is near current slot. */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, extent_slot); if (key.objectid != bytenr) break; if (key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes) { found_extent = 1; break; } if (key.type == BTRFS_METADATA_ITEM_KEY && key.offset == owner_objectid) { found_extent = 1; break; } /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { if (iref) { abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); ret = -EUCLEAN; goto out; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; if (!is_data && skinny_metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = owner_objectid; } ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret > 0 && skinny_metadata && path->slots[0]) { /* * Couldn't find our skinny metadata item, * see if we have ye olde extent item. */ path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes) ret = 0; } if (ret > 0 && skinny_metadata) { skinny_metadata = false; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); } if (ret) { if (ret > 0) btrfs_print_leaf(path->nodes[0]); btrfs_err(info, "umm, got %d back from search, was looking for %llu, slot %d", ret, bytenr, path->slots[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { abort_and_dump(trans, path, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); goto out; } else { btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(trans->fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; if (item_size < sizeof(*ei) + sizeof(*bi)) { abort_and_dump(trans, path, "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, path->slots[0], owner_objectid, item_size, sizeof(*ei) + sizeof(*bi)); ret = -EUCLEAN; goto out; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); ret = -EUCLEAN; goto out; } refs -= refs_to_drop; if (refs > 0) { if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* * In the case of inline back ref, reference count will * be updated by remove_extent_backref */ if (iref) { if (!found_extent) { abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); ret = -EUCLEAN; goto out; } } else { btrfs_set_extent_refs(leaf, ei, refs); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } } else { struct btrfs_squota_delta delta = { .root = delayed_ref_root, .num_bytes = num_bytes, .is_data = is_data, .is_inc = false, .generation = btrfs_extent_generation(leaf, ei), }; /* In this branch refs == 1 */ if (found_extent) { if (is_data && refs_to_drop != extent_data_ref_count(path, iref)) { abort_and_dump(trans, path, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), refs_to_drop, path->slots[0]); ret = -EUCLEAN; goto out; } if (iref) { if (path->slots[0] != extent_slot) { abort_and_dump(trans, path, "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", key.objectid, key.type, key.offset, path->slots[0]); ret = -EUCLEAN; goto out; } } else { /* * No inline ref, we must be at SHARED_* item, * And it's single ref, it must be: * | extent_slot ||extent_slot + 1| * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ if (path->slots[0] != extent_slot + 1) { abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); ret = -EUCLEAN; goto out; } path->slots[0] = extent_slot; num_to_del = 2; } } /* * We can't infer the data owner from the delayed ref, so we need * to try to get it from the owning ref item. * * If it is not present, then that extent was not written under * simple quotas mode, so we don't need to account for its deletion. */ if (is_data) delta.root = btrfs_get_extent_owner_root(trans->fs_info, leaf, extent_slot); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); ret = do_free_extent_accounting(trans, bytenr, &delta); } btrfs_release_path(path); out: btrfs_free_path(path); return ret; } /* * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. */ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; spin_lock(&head->lock); if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; if (cleanup_extent_op(head) != NULL) goto out; /* * waiting for the lock here would deadlock. If someone else has it * locked they are already in the process of dropping it anyway */ if (!mutex_trylock(&head->mutex)) goto out; btrfs_delete_ref_head(fs_info, delayed_refs, head); head->processing = false; spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); if (head->must_insert_reserved) ret = 1; btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; out: spin_unlock(&head->lock); out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } int btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *bg; int ret; if (root_id != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref generic_ref = { .action = BTRFS_DROP_DELAYED_REF, .bytenr = buf->start, .num_bytes = buf->len, .parent = parent, .owning_root = btrfs_header_owner(buf), .ref_root = root_id, }; /* * Assert that the extent buffer is not cleared due to * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for * detail. */ ASSERT(btrfs_header_bytenr(buf) != 0); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); if (ret < 0) return ret; } if (!last_ref) return 0; if (btrfs_header_generation(buf) != trans->transid) goto out; if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) goto out; } bg = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(trans, bg, buf->start, buf->len, 1); btrfs_put_block_group(bg); goto out; } /* * If there are tree mod log users we may have recorded mod log * operations for this node. If we re-allocate this node we * could replay operations on this node that happened when it * existed in a completely different root. For example if it * was part of root A, then was reallocated to root B, and we * are doing a btrfs_old_search_slot(root b), we could replay * operations that happened when the block was part of root A, * giving us an inconsistent view of the btree. * * We are safe from races here because at this point no other * node or root points to this extent buffer, so if after this * check a new tree mod log user joins we will not have an * existing log of operations on this node that we have to * contend with. */ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) || btrfs_is_zoned(fs_info)) { pin_down_extent(trans, bg, buf->start, buf->len, 1); btrfs_put_block_group(bg); goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(bg, buf->start, buf->len); btrfs_free_reserved_bytes(bg, buf->len, 0); btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); out: /* * Deleting the buffer, clear the corrupt flag since it doesn't * matter anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); return 0; } /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; if (btrfs_is_testing(fs_info)) return 0; /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); } else { ret = btrfs_add_delayed_data_ref(trans, ref, 0); } if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) btrfs_ref_tree_mod(fs_info, ref); return ret; } enum btrfs_loop_type { /* * Start caching block groups but do not wait for progress or for them * to be done. */ LOOP_CACHING_NOWAIT, /* * Wait for the block group free_space >= the space we're waiting for if * the block group isn't cached. */ LOOP_CACHING_WAIT, /* * Allow allocations to happen from block groups that do not yet have a * size classification. */ LOOP_UNSET_SIZE_CLASS, /* * Allocate a chunk and then retry the allocation. */ LOOP_ALLOC_CHUNK, /* * Ignore the size class restrictions for this allocation. */ LOOP_WRONG_SIZE_CLASS, /* * Ignore the empty size, only try to allocate the number of bytes * needed for this allocation. */ LOOP_NO_EMPTY_SIZE, }; static inline void btrfs_lock_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) down_read(&cache->data_rwsem); } static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, int delalloc) { btrfs_get_block_group(cache); if (delalloc) down_read(&cache->data_rwsem); } static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; spin_lock(&cluster->refill_lock); while (1) { used_bg = cluster->block_group; if (!used_bg) return NULL; if (used_bg == block_group) return used_bg; btrfs_get_block_group(used_bg); if (!delalloc) return used_bg; if (down_read_trylock(&used_bg->data_rwsem)) return used_bg; spin_unlock(&cluster->refill_lock); /* We should only have one-level nested. */ down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); spin_lock(&cluster->refill_lock); if (used_bg == cluster->block_group) return used_bg; up_read(&used_bg->data_rwsem); btrfs_put_block_group(used_bg); } } static inline void btrfs_release_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) up_read(&cache->data_rwsem); btrfs_put_block_group(cache); } /* * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ static int find_free_extent_clustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **cluster_bg_ret) { struct btrfs_block_group *cluster_bg; struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 aligned_cluster; u64 offset; int ret; cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); if (!cluster_bg) goto refill_cluster; if (cluster_bg != bg && (cluster_bg->ro || !block_group_bits(cluster_bg, ffe_ctl->flags))) goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, ffe_ctl->num_bytes, cluster_bg->start, &ffe_ctl->max_extent_size); if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; } WARN_ON(last_ptr->block_group != cluster_bg); release_cluster: /* * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so * lets just skip it and let the allocator find whatever block it can * find. If we reach this point, we will have tried the cluster * allocator plenty of times and not have found anything, so we are * likely way too fragmented for the clustering stuff to find anything. * * However, if the cluster is taken from the current block group, * release the cluster first, so that we stand a better chance of * succeeding in the unclustered allocation. */ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { spin_unlock(&last_ptr->refill_lock); btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); return -ENOENT; } /* This cluster didn't work out, free it and start over */ btrfs_return_cluster_to_free_space(NULL, last_ptr); if (cluster_bg != bg) btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); refill_cluster: if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); return -ENOENT; } aligned_cluster = max_t(u64, ffe_ctl->empty_cluster + ffe_ctl->empty_size, bg->full_stripe_len); ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, ffe_ctl->num_bytes, aligned_cluster); if (ret == 0) { /* Now pull our allocation out of this cluster */ offset = btrfs_alloc_from_cluster(bg, last_ptr, ffe_ctl->num_bytes, ffe_ctl->search_start, &ffe_ctl->max_extent_size); if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); ffe_ctl->found_offset = offset; trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } /* * At this point we either didn't find a cluster or we weren't able to * allocate a block from our cluster. Free the cluster we've been * trying to use, and go to the next block group. */ btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); return 1; } /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) { struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 offset; /* * We are doing an unclustered allocation, set the fragmented flag so * we don't bother trying to setup a cluster again until we get more * space. */ if (unlikely(last_ptr)) { spin_lock(&last_ptr->lock); last_ptr->fragmented = 1; spin_unlock(&last_ptr->lock); } if (ffe_ctl->cached) { struct btrfs_free_space_ctl *free_space_ctl; free_space_ctl = bg->free_space_ctl; spin_lock(&free_space_ctl->tree_lock); if (free_space_ctl->free_space < ffe_ctl->num_bytes + ffe_ctl->empty_cluster + ffe_ctl->empty_size) { ffe_ctl->total_free_space = max_t(u64, ffe_ctl->total_free_space, free_space_ctl->free_space); spin_unlock(&free_space_ctl->tree_lock); return 1; } spin_unlock(&free_space_ctl->tree_lock); } offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, ffe_ctl->num_bytes, ffe_ctl->empty_size, &ffe_ctl->max_extent_size); if (!offset) return 1; ffe_ctl->found_offset = offset; return 0; } static int do_allocation_clustered(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { int ret; /* We want to try and use the cluster allocator, so lets look there */ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); if (ret >= 0) return ret; /* ret == -ENOENT case falls through */ } return find_free_extent_unclustered(block_group, ffe_ctl); } /* * Tree-log block group locking * ============================ * * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which * indicates the starting address of a block group, which is reserved only * for tree-log metadata. * * Lock nesting * ============ * * space_info::lock * block_group::lock * fs_info::treelog_bg_lock */ /* * Simple allocator for sequential-only block group. It only allows sequential * allocation. No need to play with trees. This function also reserves the * bytes as in btrfs_add_reserved_bytes. */ static int do_allocation_zoned(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 start = block_group->start; u64 num_bytes = ffe_ctl->num_bytes; u64 avail; u64 bytenr = block_group->start; u64 log_bytenr; u64 data_reloc_bytenr; int ret = 0; bool skip = false; ASSERT(btrfs_is_zoned(block_group->fs_info)); /* * Do not allow non-tree-log blocks in the dedicated tree-log block * group, and vice versa. */ spin_lock(&fs_info->treelog_bg_lock); log_bytenr = fs_info->treelog_bg; if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || (!ffe_ctl->for_treelog && bytenr == log_bytenr))) skip = true; spin_unlock(&fs_info->treelog_bg_lock); if (skip) return 1; /* * Do not allow non-relocation blocks in the dedicated relocation block * group, and vice versa. */ spin_lock(&fs_info->relocation_bg_lock); data_reloc_bytenr = fs_info->data_reloc_bg; if (data_reloc_bytenr && ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) skip = true; spin_unlock(&fs_info->relocation_bg_lock); if (skip) return 1; /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. * Return the error after taking the locks. */ } spin_unlock(&block_group->lock); /* Metadata block group is activated at write time. */ if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && !btrfs_zone_activate(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. * Return the error after taking the locks. */ } spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); spin_lock(&fs_info->relocation_bg_lock); if (ret) goto out; ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); ASSERT(!ffe_ctl->for_data_reloc || block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); if (block_group->ro || (!ffe_ctl->for_data_reloc && test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { ret = 1; goto out; } /* * Do not allow currently using block group to be tree-log dedicated * block group. */ if (ffe_ctl->for_treelog && !fs_info->treelog_bg && (block_group->used || block_group->reserved)) { ret = 1; goto out; } /* * Do not allow currently used block group to be the data relocation * dedicated block group. */ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && (block_group->used || block_group->reserved)) { ret = 1; goto out; } WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); avail = block_group->zone_capacity - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { /* * With sequential allocator, free space is always * contiguous */ ffe_ctl->max_extent_size = avail; ffe_ctl->total_free_space = avail; } ret = 1; goto out; } if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; if (ffe_ctl->for_data_reloc) { if (!fs_info->data_reloc_bg) fs_info->data_reloc_bg = block_group->start; /* * Do not allow allocations from this block group, unless it is * for data relocation. Compared to increasing the ->ro, setting * the ->zoned_data_reloc_ongoing flag still allows nocow * writers to come in. See btrfs_inc_nocow_writers(). * * We need to disable an allocation to avoid an allocation of * regular (non-relocation data) extent. With mix of relocation * extents and regular extents, we can dispatch WRITE commands * (for relocation extents) and ZONE APPEND commands (for * regular extents) at the same time to the same zone, which * easily break the write pointer. * * Also, this flag avoids this block group to be zone finished. */ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); } ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); ctl->free_space -= num_bytes; spin_unlock(&ctl->tree_lock); /* * We do not check if found_offset is aligned to stripesize. The * address is anyway rewritten when using zone append writing. */ ffe_ctl->search_start = ffe_ctl->found_offset; out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0; spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); return ret; } static int do_allocation(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return do_allocation_clustered(block_group, ffe_ctl, bg_ret); case BTRFS_EXTENT_ALLOC_ZONED: return do_allocation_zoned(block_group, ffe_ctl, bg_ret); default: BUG(); } } static void release_block_group(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, int delalloc) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: ffe_ctl->retry_uncached = false; break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ break; default: BUG(); } BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != ffe_ctl->index); btrfs_release_block_group(block_group, delalloc); } static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, struct btrfs_key *ins) { struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; if (!ffe_ctl->use_cluster && last_ptr) { spin_lock(&last_ptr->lock); last_ptr->window_start = ins->objectid; spin_unlock(&last_ptr->lock); } } static void found_extent(struct find_free_extent_ctl *ffe_ctl, struct btrfs_key *ins) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: found_extent_clustered(ffe_ctl, ins); break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ break; default: BUG(); } } static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { /* Block group's activeness is not a requirement for METADATA block groups. */ if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) return 0; /* If we can activate new zone, just allocate a chunk and use it */ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) return 0; /* * We already reached the max active zones. Try to finish one block * group to make a room for a new block group. This is only possible * for a data block group because btrfs_zone_finish() may need to wait * for a running transaction which can cause a deadlock for metadata * allocation. */ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { int ret = btrfs_zone_finish_one_bg(fs_info); if (ret == 1) return 0; else if (ret < 0) return ret; } /* * If we have enough free space left in an already active block group * and we can't activate any other zone now, do not allow allocating a * new chunk and let find_free_extent() retry with a smaller size. */ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) return -ENOSPC; /* * Even min_alloc_size is not left in any block groups. Since we cannot * activate a new block group, allocating it may not help. Let's tell a * caller to try again and hope it progress something by writing some * parts of the region. That is only possible for data block groups, * where a part of the region can be written. */ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) return -EAGAIN; /* * We cannot activate a new block group and no enough space left in any * block groups. So, allocating a new block group may not help. But, * there is nothing to do anyway, so let's go with it. */ return 0; } static int can_allocate_chunk(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return 0; case BTRFS_EXTENT_ALLOC_ZONED: return can_allocate_chunk_zoned(fs_info, ffe_ctl); default: BUG(); } } /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. * Return <0 means we failed to locate any free extent. */ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; int ret; if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) ffe_ctl->orig_have_caching_bg = true; if (ins->objectid) { found_extent(ffe_ctl, ins); return 0; } if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) return 1; ffe_ctl->index++; if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) return 1; /* See the comments for btrfs_loop_type for an explanation of the phases. */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; /* * We want to skip the LOOP_CACHING_WAIT step if we don't have * any uncached bgs and we've already done a full search * through. */ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; trans = current->journal_info; if (trans) exist = 1; else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); return ret; } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) { ret = 0; ffe_ctl->loop++; } else if (ret < 0) btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) btrfs_end_transaction(trans); if (ret) return ret; } if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) return -ENOSPC; /* * Don't loop again if we already have no empty_size and * no empty_cluster. */ if (ffe_ctl->empty_size == 0 && ffe_ctl->empty_cluster == 0) return -ENOSPC; ffe_ctl->empty_size = 0; ffe_ctl->empty_cluster = 0; } return 1; } return -ENOSPC; } static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group *bg) { if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) return true; if (!btrfs_block_group_should_use_size_class(bg)) return true; if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) return true; if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && bg->size_class == BTRFS_BG_SZ_NONE) return true; return ffe_ctl->size_class == bg->size_class; } static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, struct btrfs_key *ins) { /* * If our free space is heavily fragmented we may not be able to make * big contiguous allocations, so instead of doing the expensive search * for free space, simply return ENOSPC with our max_extent_size so we * can go ahead and search for a more manageable chunk. * * If our max_extent_size is large enough for our allocation simply * disable clustering since we will likely not be able to find enough * space to create a cluster and induce latency trying. */ if (space_info->max_extent_size) { spin_lock(&space_info->lock); if (space_info->max_extent_size && ffe_ctl->num_bytes > space_info->max_extent_size) { ins->offset = space_info->max_extent_size; spin_unlock(&space_info->lock); return -ENOSPC; } else if (space_info->max_extent_size) { ffe_ctl->use_cluster = false; } spin_unlock(&space_info->lock); } ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, &ffe_ctl->empty_cluster); if (ffe_ctl->last_ptr) { struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; spin_lock(&last_ptr->lock); if (last_ptr->block_group) ffe_ctl->hint_byte = last_ptr->window_start; if (last_ptr->fragmented) { /* * We still set window_start so we can keep track of the * last place we found an allocation to try and save * some time. */ ffe_ctl->hint_byte = last_ptr->window_start; ffe_ctl->use_cluster = false; } spin_unlock(&last_ptr->lock); } return 0; } static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); if (fs_info->treelog_bg) ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); } else if (ffe_ctl->for_data_reloc) { spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { struct btrfs_block_group *block_group; spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { /* * No lock is OK here because avail is monotinically * decreasing, and this is just a hint. */ u64 avail = block_group->zone_capacity - block_group->alloc_offset; if (block_group_bits(block_group, ffe_ctl->flags) && avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; break; } } spin_unlock(&fs_info->zone_active_bgs_lock); } return 0; } static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, struct btrfs_key *ins) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } } /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY * ins->offset == the size of the hole. * Any available blocks before search_start are skipped. * * If there is no suitable free space, we will record the max size of * the free space extent currently. * * The overall logic and call chain: * * find_free_extent() * |- Iterate through all block groups * | |- Get a valid block group * | |- Try to do clustered allocation in that block group * | |- Try to do unclustered allocation in that block group * | |- Check if the result is valid * | | |- If valid, then exit * | |- Jump to next block group * | * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_root *root, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; struct btrfs_space_info *space_info; bool full_search = false; WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); ffe_ctl->search_start = 0; /* For clustered allocation */ ffe_ctl->empty_cluster = 0; ffe_ctl->last_ptr = NULL; ffe_ctl->use_cluster = true; ffe_ctl->have_caching_bg = false; ffe_ctl->orig_have_caching_bg = false; ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); ffe_ctl->loop = 0; ffe_ctl->retry_uncached = false; ffe_ctl->cached = 0; ffe_ctl->max_extent_size = 0; ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); return -ENOSPC; } ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); if (ret < 0) return ret; ffe_ctl->search_start = max(ffe_ctl->search_start, first_logical_byte(fs_info)); ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, ffe_ctl->search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. * * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { /* * someone is removing this block group, * we can't jump into the have_block_group * target because our list pointers are not * valid */ btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { ffe_ctl->index = btrfs_bg_flags_to_raid_index( block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { btrfs_put_block_group(block_group); } } search: trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) btrfs_clear_treelog_bg(block_group); if (ffe_ctl->for_data_reloc) btrfs_clear_data_reloc_bg(block_group); continue; } btrfs_grab_block_group(block_group, ffe_ctl->delalloc); ffe_ctl->search_start = block_group->start; /* * this can happen if we end up cycling through all the * raid types, but we want to make sure we only allocate * for the proper type. */ if (!block_group_bits(block_group, ffe_ctl->flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID10; /* * if they asked for extra copies and this block group * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) goto loop; /* * This block group has different flags than we want. * It's possible that we have MIXED_GROUP flag but no * block group is mixed. Just skip such block group. */ btrfs_release_block_group(block_group, ffe_ctl->delalloc); continue; } have_block_group: trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; ret = btrfs_cache_block_group(block_group, false); /* * If we get ENOMEM here or something else we want to * try other block groups, because it may not be fatal. * However if we can't find anything else we need to * save our return here so that we return the actual * error that caused problems, not ENOSPC. */ if (ret < 0) { if (!cache_block_group_error) cache_block_group_error = ret; ret = 0; goto loop; } ret = 0; } if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { if (!cache_block_group_error) cache_block_group_error = -EIO; goto loop; } if (!find_free_extent_check_size_class(ffe_ctl, block_group)) goto loop; bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret > 0) goto loop; if (bg_ret && bg_ret != block_group) { btrfs_release_block_group(block_group, ffe_ctl->delalloc); block_group = bg_ret; } /* Checks */ ffe_ctl->search_start = round_up(ffe_ctl->found_offset, fs_info->stripesize); /* move on to the next group */ if (ffe_ctl->search_start + ffe_ctl->num_bytes > block_group->start + block_group->length) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->num_bytes); goto loop; } if (ffe_ctl->found_offset < ffe_ctl->search_start) btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->search_start - ffe_ctl->found_offset); ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, ffe_ctl->delalloc, ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached) { ffe_ctl->retry_uncached = true; btrfs_wait_block_group_cache_progress(block_group, ffe_ctl->num_bytes + ffe_ctl->empty_cluster + ffe_ctl->empty_size); goto have_block_group; } release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } up_read(&space_info->groups_sem); ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); if (ret > 0) goto search; if (ret == -ENOSPC && !cache_block_group_error) { /* * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. */ if (!ffe_ctl->max_extent_size) ffe_ctl->max_extent_size = ffe_ctl->total_free_space; spin_lock(&space_info->lock); space_info->max_extent_size = ffe_ctl->max_extent_size; spin_unlock(&space_info->lock); ins->offset = ffe_ctl->max_extent_size; } else if (ret == -ENOSPC) { ret = cache_block_group_error; } return ret; } /* * Entry point to the extent allocator. Tries to find a hole that is at least * as big as @num_bytes. * * @root - The root that will contain this extent * * @ram_bytes - The amount of space in ram that @num_bytes take. This * is used for accounting purposes. This value differs * from @num_bytes only in the case of compressed extents. * * @num_bytes - Number of bytes to allocate on-disk. * * @min_alloc_size - Indicates the minimum amount of space that the * allocator should try to satisfy. In some cases * @num_bytes may be larger than what is required and if * the filesystem is fragmented then allocation fails. * However, the presence of @min_alloc_size gives a * chance to try and satisfy the smaller allocation. * * @empty_size - A hint that you plan on doing more COW. This is the * size in bytes the allocator should try to find free * next to the block it returns. This is just a hint and * may be ignored by the allocator. * * @hint_byte - Hint to the allocator to start searching above the byte * address passed. It might be ignored. * * @ins - This key is modified to record the found hole. It will * have the following values: * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY * ins->offset == the size of the hole. * * @is_data - Boolean flag indicating whether an extent is * allocated for data (true) or metadata (false) * * @delalloc - Boolean flag indicating whether this allocation is for * delalloc or not. If 'true' data_rwsem of block groups * is going to be acquired. * * * Returns 0 when an allocation succeeded or < 0 when an error occurred. In * case -ENOSPC is returned then @ins->offset will contain the size of the * largest available hole the allocator managed to find. */ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct find_free_extent_ctl ffe_ctl = {}; bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); ffe_ctl.ram_bytes = ram_bytes; ffe_ctl.num_bytes = num_bytes; ffe_ctl.min_alloc_size = min_alloc_size; ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; ffe_ctl.delalloc = delalloc; ffe_ctl.hint_byte = hint_byte; ffe_ctl.for_treelog = for_treelog; ffe_ctl.for_data_reloc = for_data_reloc; ret = find_free_extent(root, ins, &ffe_ctl); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, fs_info->sectorsize); num_bytes = max(num_bytes, min_alloc_size); ram_bytes = num_bytes; if (num_bytes == min_alloc_size) final_tried = true; goto again; } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); } } return ret; } int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { btrfs_err(fs_info, "Unable to find block group for %llu", start); return -ENOSPC; } btrfs_add_free_space(cache, start, len); btrfs_free_reserved_bytes(cache, len, delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); return 0; } int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret = 0; cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) { btrfs_err(trans->fs_info, "unable to find block group for %llu", eb->start); return -ENOSPC; } ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); btrfs_put_block_group(cache); return ret; } static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; ret = remove_from_free_space_tree(trans, bytenr, num_bytes); if (ret) return ret; ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); if (ret) { ASSERT(!ret); btrfs_err(fs_info, "update block group failed for %llu %llu", bytenr, num_bytes); return ret; } trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); return 0; } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; size = sizeof(*extent_item); if (simple_quota) size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) return -ENOMEM; extent_root = btrfs_extent_root(fs_info, ins->objectid); ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); if (ret) { btrfs_free_path(path); return ret; } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); btrfs_set_extent_refs(leaf, extent_item, ref_mod); btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); if (simple_quota) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); oref = (struct btrfs_extent_owner_ref *)(&iref->offset); btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); iref = (struct btrfs_extent_inline_ref *)(oref + 1); } btrfs_set_extent_inline_ref_type(leaf, iref, type); if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); btrfs_set_extent_inline_ref_offset(leaf, iref, parent); btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); } else { struct btrfs_extent_data_ref *ref; ref = (struct btrfs_extent_data_ref *)(&iref->offset); btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_key extent_key; struct btrfs_tree_block_info *block_info; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); const u64 flags = (extent_op ? extent_op->flags_to_set : 0); /* The owner of a tree block is the level. */ int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); extent_key.objectid = node->bytenr; if (skinny_metadata) { /* The owner of a tree block is the level. */ extent_key.offset = level; extent_key.type = BTRFS_METADATA_ITEM_KEY; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); } path = btrfs_alloc_path(); if (!path) return -ENOMEM; extent_root = btrfs_extent_root(fs_info, extent_key.objectid); ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, size); if (ret) { btrfs_free_path(path); return ret; } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); btrfs_set_extent_refs(leaf, extent_item, 1); btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); btrfs_set_tree_block_level(leaf, block_info, level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { .action = BTRFS_ADD_DELAYED_EXTENT, .bytenr = ins->objectid, .num_bytes = ins->offset, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) generic_ref.owning_root = root->relocation_src_root; btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); } /* * this is used by the tree logging recovery code. It records that * an extent has been allocated and makes sure to clear the free * space cache bits as well */ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_squota_delta delta = { .root = root_objectid, .num_bytes = ins->offset, .generation = trans->transid, .is_data = true, .is_inc = true, }; /* * Mixed block groups will exclude before processing the log so we only * need to do the exclude dance if this fs isn't mixed. */ if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(fs_info, ins->objectid, ins->offset); if (ret) return ret; } block_group = btrfs_lookup_block_group(fs_info, ins->objectid); if (!block_group) return -EINVAL; space_info = block_group->space_info; spin_lock(&space_info->lock); spin_lock(&block_group->lock); space_info->bytes_reserved += ins->offset; block_group->reserved += ins->offset; spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1, root_objectid); if (ret) btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; } #ifdef CONFIG_BTRFS_DEBUG /* * Extra safety check in case the extent tree is corrupted and extent allocator * chooses to use a tree block which is already used and locked. */ static bool check_eb_lock_owner(const struct extent_buffer *eb) { if (eb->lock_owner == current->pid) { btrfs_err_rl(eb->fs_info, "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", eb->start, btrfs_header_owner(eb), current->pid); return true; } return false; } #else static bool check_eb_lock_owner(struct extent_buffer *eb) { return false; } #endif static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; u64 lockdep_owner = owner; buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) return buf; if (check_eb_lock_owner(buf)) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } /* * The reloc trees are just snapshots, so we need them to appear to be * just like any other fs tree WRT lockdep. * * The exception however is in replace_path() in relocation, where we * hold the lock on the original fs root and then search for the reloc * root. At that point we need to make sure any reloc root buffers are * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make * lockdep happy. */ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; /* btrfs_clear_buffer_dirty() accesses generation field. */ btrfs_set_header_generation(buf, trans->transid); /* * This needs to stay, because we could allocate a freed block from an * old tree into a new tree, so we need to make sure this new block is * set to the appropriate level and owner. */ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); btrfs_tree_lock_nested(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(buf, level); btrfs_set_header_bytenr(buf, buf->start); btrfs_set_header_generation(buf, trans->transid); btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(buf, owner); write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, EXTENT_DIRTY, NULL); else set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, EXTENT_NEW, NULL); } else { buf->log_index = -1; set_extent_bit(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; } /* * finds a free extent and does all the dirty work required for allocation * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, u64 reloc_src_root, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); u64 owning_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } #endif block_rsv = btrfs_use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, empty_size, hint, &ins, 0, 0); if (ret) goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; } owning_root = btrfs_header_owner(buf); if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; owning_root = reloc_src_root; } else BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; struct btrfs_ref generic_ref = { .action = BTRFS_ADD_DELAYED_EXTENT, .bytenr = ins.objectid, .num_bytes = ins.offset, .parent = parent, .owning_root = owning_root, .ref_root = root_objectid, }; if (!skinny_metadata || flags != 0) { extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) { ret = -ENOMEM; goto out_free_buf; } if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; extent_op->update_key = (skinny_metadata ? false : true); extent_op->update_flags = (flags != 0); } else { extent_op = NULL; } btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) { btrfs_free_delayed_extent_op(extent_op); goto out_free_buf; } } return buf; out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); } struct walk_control { u64 refs[BTRFS_MAX_LEVEL]; u64 flags[BTRFS_MAX_LEVEL]; struct btrfs_key update_progress; struct btrfs_key drop_progress; int drop_level; int stage; int level; int shared_level; int update_ref; int keep_locks; int reada_slot; int reada_count; int restarted; /* Indicate that extent info needs to be looked up when walking the tree. */ int lookup_info; }; /* * This is our normal stage. We are traversing blocks the current snapshot owns * and we are dropping any of our references to any children we are able to, and * then freeing the block once we've processed all of the children. */ #define DROP_REFERENCE 1 /* * We enter this stage when we have to walk into a child block (meaning we can't * simply drop our reference to it from our current parent node) and there are * more than one reference on it. If we are the owner of any of the children * blocks from the current parent node then we have to do the FULL_BACKREF dance * on them in order to drop our normal ref and add the shared ref. */ #define UPDATE_BACKREF 2 /* * Decide if we need to walk down into this node to adjust the references. * * @root: the root we are currently deleting * @wc: the walk control for this deletion * @eb: the parent eb that we're currently visiting * @refs: the number of refs for wc->level - 1 * @flags: the flags for wc->level - 1 * @slot: the slot in the eb that we're currently checking * * This is meant to be called when we're evaluating if a node we point to at * wc->level should be read and walked into, or if we can simply delete our * reference to it. We return true if we should walk into the node, false if we * can skip it. * * We have assertions in here to make sure this is called correctly. We assume * that sanity checking on the blocks read to this point has been done, so any * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, struct extent_buffer *eb, u64 flags, int slot) { struct btrfs_key key; u64 generation; int level = wc->level; ASSERT(level > 0); ASSERT(wc->refs[level - 1] > 0); /* * The update backref stage we only want to skip if we already have * FULL_BACKREF set, otherwise we need to read. */ if (wc->stage == UPDATE_BACKREF) { if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) return false; return true; } /* * We're the last ref on this block, we must walk into it and process * any refs it's pointing at. */ if (wc->refs[level - 1] == 1) return true; /* * If we're already FULL_BACKREF then we know we can just drop our * current reference. */ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) return false; /* * This block is older than our creation generation, we can drop our * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* * This block was processed from a previous snapshot deletion run, we * can skip it. */ btrfs_node_key_to_cpu(eb, &key, slot); if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0) return false; /* All other cases we need to wander into the node. */ return true; } static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct walk_control *wc, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; u64 refs; u64 flags; u32 nritems; struct extent_buffer *eb; int ret; int slot; int nread = 0; if (path->slots[wc->level] < wc->reada_slot) { wc->reada_count = wc->reada_count * 2 / 3; wc->reada_count = max(wc->reada_count, 2); } else { wc->reada_count = wc->reada_count * 3 / 2; wc->reada_count = min_t(int, wc->reada_count, BTRFS_NODEPTRS_PER_BLOCK(fs_info)); } eb = path->nodes[wc->level]; nritems = btrfs_header_nritems(eb); for (slot = path->slots[wc->level]; slot < nritems; slot++) { if (nread >= wc->reada_count) break; cond_resched(); bytenr = btrfs_node_blockptr(eb, slot); generation = btrfs_node_ptr_generation(eb, slot); if (slot == path->slots[wc->level]) goto reada; if (wc->stage == UPDATE_BACKREF && generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, &flags, NULL); /* We don't care about errors in readahead. */ if (ret < 0) continue; /* * This could be racey, it's conceivable that we raced and end * up with a bogus refs count, if that's the case just skip, if * we are actually corrupt we will notice when we look up * everything again with our locks. */ if (refs == 0) continue; /* If we don't need to visit this node don't reada. */ if (!visit_node_for_delete(root, wc, eb, flags, slot)) continue; reada: btrfs_readahead_node_child(eb, slot); nread++; } wc->reada_slot = slot; } /* * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. * * NOTE: return value 1 means we should stop walking down. */ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) return 1; /* * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ if (wc->lookup_info && ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { ASSERT(path->locks[level]); ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); if (ret) return ret; if (unlikely(wc->refs[level] == 0)) { btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", eb->start); return -EUCLEAN; } } if (wc->stage == DROP_REFERENCE) { if (wc->refs[level] > 1) return 1; if (path->locks[level] && !wc->keep_locks) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; } /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_dec_ref(trans, root, eb, 0); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_set_disk_extent_flags(trans, eb, flag); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } wc->flags[level] |= flag; } /* * the block is shared by multiple trees, so it's not good to * keep the tree lock */ if (path->locks[level] && level > 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; } /* * This is used to verify a ref exists for this root to deal with a bug where we * would have a drop_progress key that hadn't been updated properly. */ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; struct btrfs_path *path; struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, btrfs_root_id(root), level, 0); if (ret != -ENOENT) { /* * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ btrfs_free_path(path); return (ret < 0 ) ? ret : 1; } /* * We could have a delayed ref with this reference, so look it up while * we're holding the path open to make sure we don't race with the * delayed ref running. */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) goto out; if (!mutex_trylock(&head->mutex)) { /* * We're contended, means that the delayed ref is running, get a * reference and wait for the ref head to be complete and then * try again. */ refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); goto again; } exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); btrfs_free_path(path); return exists ? 1 : 0; } /* * We may not have an uptodate block, so if we are going to walk down into this * block we need to drop the lock, read it off of the disk, re-lock it and * return to continue dropping the snapshot. */ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, struct extent_buffer *next) { struct btrfs_tree_parent_check check = { 0 }; u64 generation; int level = wc->level; int ret; btrfs_assert_tree_write_locked(next); generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); if (btrfs_buffer_uptodate(next, generation, 0)) return 0; check.level = level - 1; check.transid = generation; check.owner_root = btrfs_root_id(root); check.has_first_key = true; btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); btrfs_tree_unlock(next); if (level == 1) reada_walk_down(trans, root, wc, path); ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; } btrfs_tree_lock(next); wc->lookup_info = 1; return 0; } /* * If we determine that we don't have to visit wc->level - 1 then we need to * determine if we can drop our reference. * * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. * * If we are DROP_REFERENCE this will figure out if we need to drop our current * reference, skipping it if we dropped it from a previous incompleted drop, or * dropping it if we still have a reference to it. */ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, struct extent_buffer *next, u64 owner_root) { struct btrfs_ref ref = { .action = BTRFS_DROP_DELAYED_REF, .bytenr = next->start, .num_bytes = root->fs_info->nodesize, .owning_root = owner_root, .ref_root = btrfs_root_id(root), }; int level = wc->level; int ret; /* We are UPDATE_BACKREF, we're not dropping anything. */ if (wc->stage == UPDATE_BACKREF) return 0; if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ref.parent = path->nodes[level]->start; } else { ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { btrfs_err(root->fs_info, "mismatched block owner"); return -EIO; } } /* * If we had a drop_progress we need to verify the refs are set as * expected. If we find our ref then we know that from here on out * everything should be correct, and we can clear the * ->restarted flag. */ if (wc->restarted) { ret = check_ref_exists(trans, root, next->start, ref.parent, level - 1); if (ret <= 0) return ret; ret = 0; wc->restarted = 0; } /* * Reloc tree doesn't contribute to qgroup numbers, and we have already * accounted them at merge time (replace_path), thus we could skip * expensive subtree trace here. */ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && wc->refs[level - 1] > 1) { u64 generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); if (ret) { btrfs_err_rl(root->fs_info, "error %d accounting shared subtree, quota is out of sync, rescan required", ret); } } /* * We need to update the next key in our walk control so we can update * the drop_progress key accordingly. We don't care if find_next_key * doesn't find a key because that means we're at the end and are going * to clean up now. */ wc->drop_level = level; find_next_key(path, level, &wc->drop_progress); btrfs_init_tree_ref(&ref, level - 1, 0, false); return btrfs_free_extent(trans, &ref); } /* * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks * reference count of the block pointed to. if the block * is shared and we need update back refs for the subtree * rooted at the block, this function changes wc->stage to * UPDATE_BACKREF. if the block is shared and there is no * need to update back, this function drops the reference * to the block. * * NOTE: return value 1 means we should stop walking down. */ static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; u64 owner_root = 0; struct extent_buffer *next; int level = wc->level; int ret = 0; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); /* * if the lower level block was created before the snapshot * was created, we know there is no need to update back refs * for the subtree */ if (wc->stage == UPDATE_BACKREF && generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root), level - 1); if (IS_ERR(next)) return PTR_ERR(next); btrfs_tree_lock(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1], &owner_root); if (ret < 0) goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", bytenr); ret = -EUCLEAN; goto out_unlock; } wc->lookup_info = 0; /* If we don't have to walk into this node skip it. */ if (!visit_node_for_delete(root, wc, path->nodes[level], wc->flags[level - 1], path->slots[level])) goto skip; /* * We have to walk down into this node, and if we're currently at the * DROP_REFERNCE stage and this block is shared then we need to switch * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. */ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { wc->stage = UPDATE_BACKREF; wc->shared_level = level - 1; } ret = check_next_block_uptodate(trans, root, path, wc, next); if (ret) return ret; level--; ASSERT(level == btrfs_header_level(next)); if (level != btrfs_header_level(next)) { btrfs_err(root->fs_info, "mismatched level"); ret = -EIO; goto out_unlock; } path->nodes[level] = next; path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; return 0; skip: ret = maybe_drop_reference(trans, root, path, wc, next, owner_root); if (ret) goto out_unlock; wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; wc->lookup_info = 1; ret = 1; out_unlock: btrfs_tree_unlock(next); free_extent_buffer(next); return ret; } /* * helper to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops * reference count on the block. * * when wc->stage == UPDATE_BACKREF, this function changes * wc->stage back to DROP_REFERENCE if we changed wc->stage * to UPDATE_BACKREF previously while processing the block. * * NOTE: return value 1 means we should stop walking up. */ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; if (wc->stage == UPDATE_BACKREF) { ASSERT(wc->shared_level >= level); if (level < wc->shared_level) goto out; ret = find_next_key(path, level + 1, &wc->update_progress); if (ret > 0) wc->update_ref = 0; wc->stage = DROP_REFERENCE; wc->shared_level = -1; path->slots[level] = 0; /* * check reference count again if the block isn't locked. * we should start walking down the tree again if reference * count is one. */ if (!path->locks[level]) { ASSERT(level > 0); btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; return ret; } if (unlikely(wc->refs[level] == 0)) { btrfs_tree_unlock_rw(eb, path->locks[level]); btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", eb->start); return -EUCLEAN; } if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; return 1; } } } /* wc->stage == DROP_REFERENCE */ ASSERT(path->locks[level] || wc->refs[level] == 1); if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) ret = btrfs_dec_ref(trans, root, eb, 1); else ret = btrfs_dec_ref(trans, root, eb, 0); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } if (is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, "error %d accounting leaf items, quota is out of sync, rescan required", ret); } } } /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; else if (btrfs_root_id(root) != btrfs_header_owner(eb)) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; else if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level + 1])) goto owner_mismatch; } ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, wc->refs[level] == 1); if (ret < 0) btrfs_abort_transaction(trans, ret); out: wc->refs[level] = 0; wc->flags[level] = 0; return ret; owner_mismatch: btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", btrfs_header_owner(eb), btrfs_root_id(root)); return -EUCLEAN; } /* * walk_down_tree consists of two steps. * * walk_down_proc(). Look up the reference count and reference of our current * wc->level. At this point path->nodes[wc->level] should be populated and * uptodate, and in most cases should already be locked. If we are in * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and * we can walk back up the tree. If we are UPDATE_BACKREF we have to set * FULL_BACKREF on this node if it's not already set, and then do the * FULL_BACKREF conversion dance, which is to drop the root reference and add * the shared reference to all of this nodes children. * * do_walk_down(). This is where we actually start iterating on the children of * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping * our reference to the children that return false from visit_node_for_delete(), * which has various conditions where we know we can just drop our reference * without visiting the node. For UPDATE_BACKREF we will skip any children that * visit_node_for_delete() returns false for, only walking down when necessary. * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of * snapshot deletion. */ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { int level = wc->level; int ret = 0; wc->lookup_info = 1; while (level >= 0) { ret = walk_down_proc(trans, root, path, wc); if (ret) break; if (level == 0) break; if (path->slots[level] >= btrfs_header_nritems(path->nodes[level])) break; ret = do_walk_down(trans, root, path, wc); if (ret > 0) { path->slots[level]++; continue; } else if (ret < 0) break; level = wc->level; } return (ret == 1) ? 0 : ret; } /* * walk_up_tree() is responsible for making sure we visit every slot on our * current node, and if we're at the end of that node then we call * walk_up_proc() on our current node which will do one of a few things based on * our stage. * * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level * then we need to walk back up the tree, and then going back down into the * other slots via walk_down_tree to update any other children from our original * wc->shared_level. Once we're at or above our wc->shared_level we can switch * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on. * * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents * in our current leaf. After that we call btrfs_free_tree_block() on the * current node and walk up to the next node to walk down the next slot. */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, int max_level) { int level = wc->level; int ret; path->slots[level] = btrfs_header_nritems(path->nodes[level]); while (level < max_level && path->nodes[level]) { wc->level = level; if (path->slots[level] + 1 < btrfs_header_nritems(path->nodes[level])) { path->slots[level]++; return 0; } else { ret = walk_up_proc(trans, root, path, wc); if (ret > 0) return 0; if (ret < 0) return ret; if (path->locks[level]) { btrfs_tree_unlock_rw(path->nodes[level], path->locks[level]); path->locks[level] = 0; } free_extent_buffer(path->nodes[level]); path->nodes[level] = NULL; level++; } } return 1; } /* * drop a subvolume tree. * * this function traverses the tree freeing any blocks that only * referenced by the tree. * * when a shared tree block is found. this function decreases its * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. * * If called with for_reloc == 0, may exit early with -EAGAIN */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root_item *root_item = &root->root_item; struct walk_control *wc; struct btrfs_key key; const u64 rootid = btrfs_root_id(root); int ret = 0; int level; bool root_dropped = false; bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { btrfs_free_path(path); ret = -ENOMEM; goto out; } /* * Use join to avoid potential EINTR from transaction start. See * wait_reserve_ticket and the whole reservation callchain. */ if (for_reloc) trans = btrfs_join_transaction(tree_root); else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_free; } ret = btrfs_run_delayed_items(trans); if (ret) goto out_end_trans; /* * This will help us catch people modifying the fs tree while we're * dropping it. It is unsafe to mess with the fs tree while it's being * dropped as we unlock the root node and parent nodes as we walk down * the tree, assuming nothing will change. If something does change * then we'll have stale information and drop references to blocks we've * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); memcpy(&wc->update_progress, &key, sizeof(wc->update_progress)); level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->lowest_level = 0; if (ret < 0) goto out_end_trans; WARN_ON(ret > 0); ret = 0; /* * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ btrfs_unlock_up_safe(path, 0); level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK; /* * btrfs_lookup_extent_info() returns 0 for success, * or < 0 for error. */ ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); if (ret < 0) goto out_end_trans; BUG_ON(wc->refs[level] == 0); if (level == btrfs_root_drop_level(root_item)) break; btrfs_tree_unlock(path->nodes[level]); path->locks[level] = 0; WARN_ON(wc->refs[level] != 1); level--; } } wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); wc->level = level; wc->shared_level = -1; wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { btrfs_abort_transaction(trans, ret); break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); if (ret < 0) { btrfs_abort_transaction(trans, ret); break; } if (ret > 0) { BUG_ON(wc->stage != DROP_REFERENCE); ret = 0; break; } if (wc->stage == DROP_REFERENCE) { wc->drop_level = wc->level; btrfs_node_key_to_cpu(path->nodes[wc->drop_level], &wc->drop_progress, path->slots[wc->drop_level]); } btrfs_cpu_key_to_disk(&root_item->drop_progress, &wc->drop_progress); btrfs_set_root_drop_level(root_item, wc->drop_level); BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } if (!is_reloc_root) btrfs_set_last_root_drop_gen(fs_info, trans->transid); btrfs_end_transaction_throttle(trans); if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, "drop snapshot early exit"); ret = -EAGAIN; goto out_free; } /* * Use join to avoid potential EINTR from transaction * start. See wait_reserve_ticket and the whole * reservation callchain. */ if (for_reloc) trans = btrfs_join_transaction(tree_root); else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_free; } } } btrfs_release_path(path); if (ret) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } else if (ret > 0) { ret = 0; /* * If we fail to delete the orphan item this time * around, it'll get picked up the next time. * * The most common failure here is just -ENOENT. */ btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); } } /* * This subvolume is going to be completely dropped, and won't be * recorded as dirty roots, thus pertrans meta rsv will not be freed at * commit transaction time. So free it here manually. */ btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); root_dropped = true; out_end_trans: if (!is_reloc_root) btrfs_set_last_root_drop_gen(fs_info, trans->transid); btrfs_end_transaction_throttle(trans); out_free: kfree(wc); btrfs_free_path(path); out: if (!ret && root_dropped) { ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid); if (ret < 0) btrfs_warn_rl(fs_info, "failed to cleanup qgroup 0/%llu: %d", rootid, ret); ret = 0; } /* * We were an unfinished drop root, check to see if there are any * pending, and if not clear and wake up any waiters. */ if (!ret && unfinished_drop) btrfs_maybe_wake_unfinished_drop(fs_info); /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we * keep trying to do the work later. This also cleans up roots if we * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); return ret; } /* * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct walk_control *wc; int level; int parent_level; int ret = 0; BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); if (!path) return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { btrfs_free_path(path); return -ENOMEM; } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); btrfs_assert_tree_write_locked(node); level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; wc->level = level; wc->shared_level = -1; wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { ret = walk_down_tree(trans, root, path, wc); if (ret < 0) break; ret = walk_up_tree(trans, root, path, wc, parent_level); if (ret) { if (ret > 0) ret = 0; break; } } kfree(wc); btrfs_free_path(path); return ret; } /* * Unpin the extent range in an error context and don't add the space back. * Errors are not propagated further. */ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { unpin_extent_range(fs_info, start, end, false); } /* * It used to be that old block groups would be left around forever. * Iterating over them would be enough to trim unused space. Since we * now automatically remove them, we also need to iterate over unallocated * space. * * We don't want a transaction for this since the discard may take a * substantial amount of time. We don't require that a transaction be * running, but we do need to take a running transaction into account * to ensure that we're not discarding chunks that were released or * allocated in the current transaction. * * Holding the chunks lock will prevent other threads from allocating * or releasing chunks, but it won't prevent a running transaction * from committing and releasing the memory that the pending chunks * list head uses. For that, we need to take a reference to the * transaction and hold the commit root sem. We only need to hold * it while performing the free space search since we have already * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; /* Discard not supported = nothing to do. */ if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; /* No free space = nothing to do. */ if (device->total_bytes <= device->bytes_used) return 0; ret = 0; while (1) { struct btrfs_fs_info *fs_info = device->fs_info; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, btrfs_dev_name(device), device->total_bytes); mutex_unlock(&fs_info->chunk_mutex); ret = 0; break; } /* Ensure we skip the reserved space on each device. */ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the * end of the device it will set end to -1, in this case it's up * to the caller to trim the value to the size of the device. */ end = min(end, device->total_bytes - 1); len = end - start + 1; /* We didn't find any extents */ if (!len) { mutex_unlock(&fs_info->chunk_mutex); ret = 0; break; } ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) set_extent_bit(&device->alloc_state, start, start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) break; start += len; *trimmed += bytes; if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } cond_resched(); } return ret; } /* * Trim the whole filesystem by: * 1) trimming the free space in each block group * 2) trimming the unallocated space on each device * * This will also continue trimming even if a block group or device encounters * an error. The return value will be the last error, or 0 if nothing bad * happens. */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; struct btrfs_device *device; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; u64 end; u64 trimmed = 0; u64 bg_failed = 0; u64 dev_failed = 0; int bg_ret = 0; int dev_ret = 0; int ret = 0; if (range->start == U64_MAX) return -EINVAL; /* * Check range overflow if range->len is set. * The default range->len is U64_MAX. */ if (range->len != U64_MAX && check_add_overflow(range->start, range->len, &range_end)) return -EINVAL; cache = btrfs_lookup_first_block_group(fs_info, range->start); for (; cache; cache = btrfs_next_block_group(cache)) { if (cache->start >= range_end) { btrfs_put_block_group(cache); break; } start = max(range->start, cache->start); end = min(range_end, cache->start + cache->length); if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; bg_ret = ret; continue; } } ret = btrfs_trim_block_group(cache, &group_trimmed, start, end, range->minlen); trimmed += group_trimmed; if (ret) { bg_failed++; bg_ret = ret; continue; } } } if (bg_failed) btrfs_warn(fs_info, "failed to trim %llu block group(s), last error %d", bg_failed, bg_ret); mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } } mutex_unlock(&fs_devices->device_list_mutex); if (dev_failed) btrfs_warn(fs_info, "failed to trim %llu device(s), last error %d", dev_failed, dev_ret); range->len = trimmed; if (bg_ret) return bg_ret; return dev_ret; } |
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <net/tcp_states.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> static void ax25_ds_timeout(struct timer_list *); /* * Add DAMA slave timeout timer to timer list. * Unlike the connection based timers the timeout function gets * triggered every second. Please note that NET_AX25_DAMA_SLAVE_TIMEOUT * (aka /proc/sys/net/ax25/{dev}/dama_slave_timeout) is still in * 1/10th of a second. */ void ax25_ds_setup_timer(ax25_dev *ax25_dev) { timer_setup(&ax25_dev->dama.slave_timer, ax25_ds_timeout, 0); } void ax25_ds_del_timer(ax25_dev *ax25_dev) { if (ax25_dev) del_timer(&ax25_dev->dama.slave_timer); } void ax25_ds_set_timer(ax25_dev *ax25_dev) { if (ax25_dev == NULL) /* paranoia */ return; ax25_dev->dama.slave_timeout = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10; mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ); } /* * DAMA Slave Timeout * Silently discard all (slave) connections in case our master forgot us... */ static void ax25_ds_timeout(struct timer_list *t) { ax25_dev *ax25_dev = from_timer(ax25_dev, t, dama.slave_timer); ax25_cb *ax25; if (ax25_dev == NULL || !ax25_dev->dama.slave) return; /* Yikes! */ if (!ax25_dev->dama.slave_timeout || --ax25_dev->dama.slave_timeout) { ax25_ds_set_timer(ax25_dev); return; } spin_lock(&ax25_list_lock); ax25_for_each(ax25, &ax25_list) { if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE)) continue; ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); ax25_disconnect(ax25, ETIMEDOUT); } spin_unlock(&ax25_list_lock); ax25_dev_dama_off(ax25_dev); } void ax25_ds_heartbeat_expiry(ax25_cb *ax25) { struct sock *sk=ax25->sk; if (sk) bh_lock_sock(sk); switch (ax25->state) { case AX25_STATE_0: case AX25_STATE_2: /* Magic here: If we listen() and a new link dies before it is accepted() it isn't 'dead' so doesn't get removed. */ if (!sk || sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { if (sk) { sock_hold(sk); ax25_destroy_socket(ax25); bh_unlock_sock(sk); /* Ungrab socket and destroy it */ sock_put(sk); } else ax25_destroy_socket(ax25); return; } break; case AX25_STATE_3: /* * Check the state of the receive buffer. */ if (sk != NULL) { if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf >> 1) && (ax25->condition & AX25_COND_OWN_RX_BUSY)) { ax25->condition &= ~AX25_COND_OWN_RX_BUSY; ax25->condition &= ~AX25_COND_ACK_PENDING; break; } } break; } if (sk) bh_unlock_sock(sk); ax25_start_heartbeat(ax25); } /* dl1bke 960114: T3 works much like the IDLE timeout, but * gets reloaded with every frame for this * connection. */ void ax25_ds_t3timer_expiry(ax25_cb *ax25) { ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); ax25_dama_off(ax25); ax25_disconnect(ax25, ETIMEDOUT); } /* dl1bke 960228: close the connection when IDLE expires. * unlike T3 this timer gets reloaded only on * I frames. */ void ax25_ds_idletimer_expiry(ax25_cb *ax25) { ax25_clear_queues(ax25); ax25->n2count = 0; ax25->state = AX25_STATE_2; ax25_calculate_t1(ax25); ax25_start_t1timer(ax25); ax25_stop_t3timer(ax25); if (ax25->sk != NULL) { bh_lock_sock(ax25->sk); ax25->sk->sk_state = TCP_CLOSE; ax25->sk->sk_err = 0; ax25->sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(ax25->sk, SOCK_DEAD)) { ax25->sk->sk_state_change(ax25->sk); sock_set_flag(ax25->sk, SOCK_DEAD); } bh_unlock_sock(ax25->sk); } } /* dl1bke 960114: The DAMA protocol requires to send data and SABM/DISC * within the poll of any connected channel. Remember * that we are not allowed to send anything unless we * get polled by the Master. * * Thus we'll have to do parts of our T1 handling in * ax25_enquiry_response(). */ void ax25_ds_t1_timeout(ax25_cb *ax25) { switch (ax25->state) { case AX25_STATE_1: if (ax25->n2count == ax25->n2) { if (ax25->modulus == AX25_MODULUS) { ax25_disconnect(ax25, ETIMEDOUT); return; } else { ax25->modulus = AX25_MODULUS; ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; ax25->n2count = 0; ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); } } else { ax25->n2count++; if (ax25->modulus == AX25_MODULUS) ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); else ax25_send_control(ax25, AX25_SABME, AX25_POLLOFF, AX25_COMMAND); } break; case AX25_STATE_2: if (ax25->n2count == ax25->n2) { ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); if (!sock_flag(ax25->sk, SOCK_DESTROY)) ax25_disconnect(ax25, ETIMEDOUT); return; } else { ax25->n2count++; } break; case AX25_STATE_3: if (ax25->n2count == ax25->n2) { ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE); ax25_disconnect(ax25, ETIMEDOUT); return; } else { ax25->n2count++; } break; } ax25_calculate_t1(ax25); ax25_start_t1timer(ax25); } |
| 32 35 1 34 31 31 32 35 35 31 32 32 32 32 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/qpolicy.c * * Policy-based packet dequeueing interface for DCCP. * * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> */ #include "dccp.h" /* * Simple Dequeueing Policy: * If tx_qlen is different from 0, enqueue up to tx_qlen elements. */ static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) { skb_queue_tail(&sk->sk_write_queue, skb); } static bool qpolicy_simple_full(struct sock *sk) { return dccp_sk(sk)->dccps_tx_qlen && sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; } static struct sk_buff *qpolicy_simple_top(struct sock *sk) { return skb_peek(&sk->sk_write_queue); } /* * Priority-based Dequeueing Policy: * If tx_qlen is different from 0 and the queue has reached its upper bound * of tx_qlen elements, replace older packets lowest-priority-first. */ static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) { struct sk_buff *skb, *best = NULL; skb_queue_walk(&sk->sk_write_queue, skb) if (best == NULL || skb->priority > best->priority) best = skb; return best; } static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) { struct sk_buff *skb, *worst = NULL; skb_queue_walk(&sk->sk_write_queue, skb) if (worst == NULL || skb->priority < worst->priority) worst = skb; return worst; } static bool qpolicy_prio_full(struct sock *sk) { if (qpolicy_simple_full(sk)) dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); return false; } /** * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface * @push: add a new @skb to the write queue * @full: indicates that no more packets will be admitted * @top: peeks at whatever the queueing policy defines as its `top' * @params: parameter passed to policy operation */ struct dccp_qpolicy_operations { void (*push) (struct sock *sk, struct sk_buff *skb); bool (*full) (struct sock *sk); struct sk_buff* (*top) (struct sock *sk); __be32 params; }; static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { [DCCPQ_POLICY_SIMPLE] = { .push = qpolicy_simple_push, .full = qpolicy_simple_full, .top = qpolicy_simple_top, .params = 0, }, [DCCPQ_POLICY_PRIO] = { .push = qpolicy_simple_push, .full = qpolicy_prio_full, .top = qpolicy_prio_best_skb, .params = DCCP_SCM_PRIORITY, }, }; /* * Externally visible interface */ void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) { qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); } bool dccp_qpolicy_full(struct sock *sk) { return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); } void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) { if (skb != NULL) { skb_unlink(skb, &sk->sk_write_queue); kfree_skb(skb); } } struct sk_buff *dccp_qpolicy_top(struct sock *sk) { return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); } struct sk_buff *dccp_qpolicy_pop(struct sock *sk) { struct sk_buff *skb = dccp_qpolicy_top(sk); if (skb != NULL) { /* Clear any skb fields that we used internally */ skb->priority = 0; skb_unlink(skb, &sk->sk_write_queue); } return skb; } bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) { /* check if exactly one bit is set */ if (!param || (param & (param - 1))) return false; return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; } |
| 2 2 2 20 20 20 10 10 4 1 3 4 4 3 3 2 4 3 3 3 4 4 4 4 4 4 4 2 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 | // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 ethtool hooks for cfg80211 * * Copied from cfg.c - originally * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014 Intel Corporation (Author: Johannes Berg) * Copyright (C) 2018, 2022-2023 Intel Corporation */ #include <linux/types.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" static int ieee80211_set_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; guard(wiphy)(local->hw.wiphy); return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); } static void ieee80211_get_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); memset(rp, 0, sizeof(*rp)); guard(wiphy)(local->hw.wiphy); drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) static int ieee80211_get_sset_count(struct net_device *dev, int sset) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int rv = 0; if (sset == ETH_SS_STATS) rv += STA_STATS_LEN; rv += drv_get_et_sset_count(sdata, sset); if (rv == 0) return -EOPNOTSUPP; return rv; } static void ieee80211_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct sta_info *sta; struct ieee80211_local *local = sdata->local; struct station_info sinfo; struct survey_info survey; int i, q; #define STA_STATS_SURVEY_LEN 7 memset(data, 0, sizeof(u64) * STA_STATS_LEN); #define ADD_STA_STATS(sta) \ do { \ data[i++] += sinfo.rx_packets; \ data[i++] += sinfo.rx_bytes; \ data[i++] += (sta)->rx_stats.num_duplicates; \ data[i++] += (sta)->rx_stats.fragments; \ data[i++] += sinfo.rx_dropped_misc; \ \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ data[i++] += (sta)->status_stats.filtered; \ data[i++] += sinfo.tx_failed; \ data[i++] += sinfo.tx_retries; \ } while (0) /* For Managed stations, find the single station based on BSSID * and use that. For interface types, iterate through all available * stations and add stats for any station that is assigned to this * network device. */ guard(wiphy)(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid); if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); data[i++] = sta->sta_state; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { list_for_each_entry(sta, &local->sta_list, list) { /* Make sure this station belongs to the proper dev */ if (sta->sdata->dev != dev) continue; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); } } do_survey: i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; /* Get survey stats for current channel */ survey.filled = 0; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; else if (local->open_count > 0 && local->open_count == local->monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) channel = local->monitor_chanreq.oper.chan; else channel = NULL; rcu_read_unlock(); if (channel) { q = 0; do { survey.filled = 0; if (drv_get_survey(local, q, &survey) != 0) { survey.filled = 0; break; } q++; } while (channel != survey.channel); } if (survey.filled) data[i++] = survey.channel->center_freq; else data[i++] = 0; if (survey.filled & SURVEY_INFO_NOISE_DBM) data[i++] = (u8)survey.noise; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME) data[i++] = survey.time; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_BUSY) data[i++] = survey.time_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_EXT_BUSY) data[i++] = survey.time_ext_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_RX) data[i++] = survey.time_rx; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_TX) data[i++] = survey.time_tx; else data[i++] = -1LL; if (WARN_ON(i != STA_STATS_LEN)) return; drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int sz_sta_stats = 0; if (sset == ETH_SS_STATS) { sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); } drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); } static int ieee80211_get_regs_len(struct net_device *dev) { return 0; } static void ieee80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; regs->version = wdev->wiphy->hw_version; regs->len = 0; } const struct ethtool_ops ieee80211_ethtool_ops = { .get_drvinfo = cfg80211_get_drvinfo, .get_regs_len = ieee80211_get_regs_len, .get_regs = ieee80211_get_regs, .get_link = ethtool_op_get_link, .get_ringparam = ieee80211_get_ringparam, .set_ringparam = ieee80211_set_ringparam, .get_strings = ieee80211_get_strings, .get_ethtool_stats = ieee80211_get_stats, .get_sset_count = ieee80211_get_sset_count, }; |
| 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Abilis Systems Single DVB-T Receiver * Copyright (C) 2008 Pierrick Hascoet <pierrick.hascoet@abilis.com> * Copyright (C) 2010 Devin Heitmueller <dheitmueller@kernellabs.com> */ #include <media/dvb_frontend.h> #include "as102_fe.h" struct as102_state { struct dvb_frontend frontend; struct as10x_demod_stats demod_stats; const struct as102_fe_ops *ops; void *priv; uint8_t elna_cfg; /* signal strength */ uint16_t signal_strength; /* bit error rate */ uint32_t ber; }; static uint8_t as102_fe_get_code_rate(enum fe_code_rate arg) { uint8_t c; switch (arg) { case FEC_1_2: c = CODE_RATE_1_2; break; case FEC_2_3: c = CODE_RATE_2_3; break; case FEC_3_4: c = CODE_RATE_3_4; break; case FEC_5_6: c = CODE_RATE_5_6; break; case FEC_7_8: c = CODE_RATE_7_8; break; default: c = CODE_RATE_UNKNOWN; break; } return c; } static int as102_fe_set_frontend(struct dvb_frontend *fe) { struct as102_state *state = fe->demodulator_priv; struct dtv_frontend_properties *c = &fe->dtv_property_cache; struct as10x_tune_args tune_args = { 0 }; /* set frequency */ tune_args.freq = c->frequency / 1000; /* fix interleaving_mode */ tune_args.interleaving_mode = INTLV_NATIVE; switch (c->bandwidth_hz) { case 8000000: tune_args.bandwidth = BW_8_MHZ; break; case 7000000: tune_args.bandwidth = BW_7_MHZ; break; case 6000000: tune_args.bandwidth = BW_6_MHZ; break; default: tune_args.bandwidth = BW_8_MHZ; } switch (c->guard_interval) { case GUARD_INTERVAL_1_32: tune_args.guard_interval = GUARD_INT_1_32; break; case GUARD_INTERVAL_1_16: tune_args.guard_interval = GUARD_INT_1_16; break; case GUARD_INTERVAL_1_8: tune_args.guard_interval = GUARD_INT_1_8; break; case GUARD_INTERVAL_1_4: tune_args.guard_interval = GUARD_INT_1_4; break; case GUARD_INTERVAL_AUTO: default: tune_args.guard_interval = GUARD_UNKNOWN; break; } switch (c->modulation) { case QPSK: tune_args.modulation = CONST_QPSK; break; case QAM_16: tune_args.modulation = CONST_QAM16; break; case QAM_64: tune_args.modulation = CONST_QAM64; break; default: tune_args.modulation = CONST_UNKNOWN; break; } switch (c->transmission_mode) { case TRANSMISSION_MODE_2K: tune_args.transmission_mode = TRANS_MODE_2K; break; case TRANSMISSION_MODE_8K: tune_args.transmission_mode = TRANS_MODE_8K; break; default: tune_args.transmission_mode = TRANS_MODE_UNKNOWN; } switch (c->hierarchy) { case HIERARCHY_NONE: tune_args.hierarchy = HIER_NONE; break; case HIERARCHY_1: tune_args.hierarchy = HIER_ALPHA_1; break; case HIERARCHY_2: tune_args.hierarchy = HIER_ALPHA_2; break; case HIERARCHY_4: tune_args.hierarchy = HIER_ALPHA_4; break; case HIERARCHY_AUTO: tune_args.hierarchy = HIER_UNKNOWN; break; } pr_debug("as102: tuner parameters: freq: %d bw: 0x%02x gi: 0x%02x\n", c->frequency, tune_args.bandwidth, tune_args.guard_interval); /* * Detect a hierarchy selection * if HP/LP are both set to FEC_NONE, HP will be selected. */ if ((tune_args.hierarchy != HIER_NONE) && ((c->code_rate_LP == FEC_NONE) || (c->code_rate_HP == FEC_NONE))) { if (c->code_rate_LP == FEC_NONE) { tune_args.hier_select = HIER_HIGH_PRIORITY; tune_args.code_rate = as102_fe_get_code_rate(c->code_rate_HP); } if (c->code_rate_HP == FEC_NONE) { tune_args.hier_select = HIER_LOW_PRIORITY; tune_args.code_rate = as102_fe_get_code_rate(c->code_rate_LP); } pr_debug("as102: \thierarchy: 0x%02x selected: %s code_rate_%s: 0x%02x\n", tune_args.hierarchy, tune_args.hier_select == HIER_HIGH_PRIORITY ? "HP" : "LP", tune_args.hier_select == HIER_HIGH_PRIORITY ? "HP" : "LP", tune_args.code_rate); } else { tune_args.code_rate = as102_fe_get_code_rate(c->code_rate_HP); } /* Set frontend arguments */ return state->ops->set_tune(state->priv, &tune_args); } static int as102_fe_get_frontend(struct dvb_frontend *fe, struct dtv_frontend_properties *c) { struct as102_state *state = fe->demodulator_priv; int ret = 0; struct as10x_tps tps = { 0 }; /* send abilis command: GET_TPS */ ret = state->ops->get_tps(state->priv, &tps); if (ret < 0) return ret; /* extract constellation */ switch (tps.modulation) { case CONST_QPSK: c->modulation = QPSK; break; case CONST_QAM16: c->modulation = QAM_16; break; case CONST_QAM64: c->modulation = QAM_64; break; } /* extract hierarchy */ switch (tps.hierarchy) { case HIER_NONE: c->hierarchy = HIERARCHY_NONE; break; case HIER_ALPHA_1: c->hierarchy = HIERARCHY_1; break; case HIER_ALPHA_2: c->hierarchy = HIERARCHY_2; break; case HIER_ALPHA_4: c->hierarchy = HIERARCHY_4; break; } /* extract code rate HP */ switch (tps.code_rate_HP) { case CODE_RATE_1_2: c->code_rate_HP = FEC_1_2; break; case CODE_RATE_2_3: c->code_rate_HP = FEC_2_3; break; case CODE_RATE_3_4: c->code_rate_HP = FEC_3_4; break; case CODE_RATE_5_6: c->code_rate_HP = FEC_5_6; break; case CODE_RATE_7_8: c->code_rate_HP = FEC_7_8; break; } /* extract code rate LP */ switch (tps.code_rate_LP) { case CODE_RATE_1_2: c->code_rate_LP = FEC_1_2; break; case CODE_RATE_2_3: c->code_rate_LP = FEC_2_3; break; case CODE_RATE_3_4: c->code_rate_LP = FEC_3_4; break; case CODE_RATE_5_6: c->code_rate_LP = FEC_5_6; break; case CODE_RATE_7_8: c->code_rate_LP = FEC_7_8; break; } /* extract guard interval */ switch (tps.guard_interval) { case GUARD_INT_1_32: c->guard_interval = GUARD_INTERVAL_1_32; break; case GUARD_INT_1_16: c->guard_interval = GUARD_INTERVAL_1_16; break; case GUARD_INT_1_8: c->guard_interval = GUARD_INTERVAL_1_8; break; case GUARD_INT_1_4: c->guard_interval = GUARD_INTERVAL_1_4; break; } /* extract transmission mode */ switch (tps.transmission_mode) { case TRANS_MODE_2K: c->transmission_mode = TRANSMISSION_MODE_2K; break; case TRANS_MODE_8K: c->transmission_mode = TRANSMISSION_MODE_8K; break; } return 0; } static int as102_fe_get_tune_settings(struct dvb_frontend *fe, struct dvb_frontend_tune_settings *settings) { settings->min_delay_ms = 1000; return 0; } static int as102_fe_read_status(struct dvb_frontend *fe, enum fe_status *status) { int ret = 0; struct as102_state *state = fe->demodulator_priv; struct as10x_tune_status tstate = { 0 }; /* send abilis command: GET_TUNE_STATUS */ ret = state->ops->get_status(state->priv, &tstate); if (ret < 0) return ret; state->signal_strength = tstate.signal_strength; state->ber = tstate.BER; switch (tstate.tune_state) { case TUNE_STATUS_SIGNAL_DVB_OK: *status = FE_HAS_SIGNAL | FE_HAS_CARRIER; break; case TUNE_STATUS_STREAM_DETECTED: *status = FE_HAS_SIGNAL | FE_HAS_CARRIER | FE_HAS_SYNC | FE_HAS_VITERBI; break; case TUNE_STATUS_STREAM_TUNED: *status = FE_HAS_SIGNAL | FE_HAS_CARRIER | FE_HAS_SYNC | FE_HAS_LOCK | FE_HAS_VITERBI; break; default: *status = TUNE_STATUS_NOT_TUNED; } pr_debug("as102: tuner status: 0x%02x, strength %d, per: %d, ber: %d\n", tstate.tune_state, tstate.signal_strength, tstate.PER, tstate.BER); if (!(*status & FE_HAS_LOCK)) { memset(&state->demod_stats, 0, sizeof(state->demod_stats)); return 0; } ret = state->ops->get_stats(state->priv, &state->demod_stats); if (ret < 0) memset(&state->demod_stats, 0, sizeof(state->demod_stats)); return ret; } /* * Note: * - in AS102 SNR=MER * - the SNR will be returned in linear terms, i.e. not in dB * - the accuracy equals ±2dB for a SNR range from 4dB to 30dB * - the accuracy is >2dB for SNR values outside this range */ static int as102_fe_read_snr(struct dvb_frontend *fe, u16 *snr) { struct as102_state *state = fe->demodulator_priv; *snr = state->demod_stats.mer; return 0; } static int as102_fe_read_ber(struct dvb_frontend *fe, u32 *ber) { struct as102_state *state = fe->demodulator_priv; *ber = state->ber; return 0; } static int as102_fe_read_signal_strength(struct dvb_frontend *fe, u16 *strength) { struct as102_state *state = fe->demodulator_priv; *strength = (((0xffff * 400) * state->signal_strength + 41000) * 2); return 0; } static int as102_fe_read_ucblocks(struct dvb_frontend *fe, u32 *ucblocks) { struct as102_state *state = fe->demodulator_priv; if (state->demod_stats.has_started) *ucblocks = state->demod_stats.bad_frame_count; else *ucblocks = 0; return 0; } static int as102_fe_ts_bus_ctrl(struct dvb_frontend *fe, int acquire) { struct as102_state *state = fe->demodulator_priv; return state->ops->stream_ctrl(state->priv, acquire, state->elna_cfg); } static void as102_fe_release(struct dvb_frontend *fe) { struct as102_state *state = fe->demodulator_priv; kfree(state); } static const struct dvb_frontend_ops as102_fe_ops = { .delsys = { SYS_DVBT }, .info = { .name = "Abilis AS102 DVB-T", .frequency_min_hz = 174 * MHz, .frequency_max_hz = 862 * MHz, .frequency_stepsize_hz = 166667, .caps = FE_CAN_INVERSION_AUTO | FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | FE_CAN_QAM_16 | FE_CAN_QAM_64 | FE_CAN_QPSK | FE_CAN_QAM_AUTO | FE_CAN_TRANSMISSION_MODE_AUTO | FE_CAN_GUARD_INTERVAL_AUTO | FE_CAN_HIERARCHY_AUTO | FE_CAN_RECOVER | FE_CAN_MUTE_TS }, .set_frontend = as102_fe_set_frontend, .get_frontend = as102_fe_get_frontend, .get_tune_settings = as102_fe_get_tune_settings, .read_status = as102_fe_read_status, .read_snr = as102_fe_read_snr, .read_ber = as102_fe_read_ber, .read_signal_strength = as102_fe_read_signal_strength, .read_ucblocks = as102_fe_read_ucblocks, .ts_bus_ctrl = as102_fe_ts_bus_ctrl, .release = as102_fe_release, }; struct dvb_frontend *as102_attach(const char *name, const struct as102_fe_ops *ops, void *priv, uint8_t elna_cfg) { struct as102_state *state; struct dvb_frontend *fe; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; fe = &state->frontend; fe->demodulator_priv = state; state->ops = ops; state->priv = priv; state->elna_cfg = elna_cfg; /* init frontend callback ops */ memcpy(&fe->ops, &as102_fe_ops, sizeof(struct dvb_frontend_ops)); strscpy(fe->ops.info.name, name, sizeof(fe->ops.info.name)); return fe; } EXPORT_SYMBOL_GPL(as102_attach); MODULE_DESCRIPTION("as102-fe"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pierrick Hascoet <pierrick.hascoet@abilis.com>"); |
| 2 2 2 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 2 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Etoms Et61x151 GPL Linux driver by Michel Xhaard (09/09/2004) * * V4L2 by Jean-Francois Moine <http://moinejf.free.fr> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "etoms" #include "gspca.h" MODULE_AUTHOR("Michel Xhaard <mxhaard@users.sourceforge.net>"); MODULE_DESCRIPTION("Etoms USB Camera Driver"); MODULE_LICENSE("GPL"); /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ unsigned char autogain; char sensor; #define SENSOR_PAS106 0 #define SENSOR_TAS5130CXX 1 signed char ag_cnt; #define AG_CNT_START 13 }; static const struct v4l2_pix_format vga_mode[] = { {320, 240, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 1}, /* {640, 480, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 640, .sizeimage = 640 * 480, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, */ }; static const struct v4l2_pix_format sif_mode[] = { {176, 144, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 176, .sizeimage = 176 * 144, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 1}, {352, 288, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 352, .sizeimage = 352 * 288, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, }; #define ETOMS_ALT_SIZE_1000 12 #define ET_GPIO_DIR_CTRL 0x04 /* Control IO bit[0..5] (0 in 1 out) */ #define ET_GPIO_OUT 0x05 /* Only IO data */ #define ET_GPIO_IN 0x06 /* Read Only IO data */ #define ET_RESET_ALL 0x03 #define ET_ClCK 0x01 #define ET_CTRL 0x02 /* enable i2c OutClck Powerdown mode */ #define ET_COMP 0x12 /* Compression register */ #define ET_MAXQt 0x13 #define ET_MINQt 0x14 #define ET_COMP_VAL0 0x02 #define ET_COMP_VAL1 0x03 #define ET_REG1d 0x1d #define ET_REG1e 0x1e #define ET_REG1f 0x1f #define ET_REG20 0x20 #define ET_REG21 0x21 #define ET_REG22 0x22 #define ET_REG23 0x23 #define ET_REG24 0x24 #define ET_REG25 0x25 /* base registers for luma calculation */ #define ET_LUMA_CENTER 0x39 #define ET_G_RED 0x4d #define ET_G_GREEN1 0x4e #define ET_G_BLUE 0x4f #define ET_G_GREEN2 0x50 #define ET_G_GR_H 0x51 #define ET_G_GB_H 0x52 #define ET_O_RED 0x34 #define ET_O_GREEN1 0x35 #define ET_O_BLUE 0x36 #define ET_O_GREEN2 0x37 #define ET_SYNCHRO 0x68 #define ET_STARTX 0x69 #define ET_STARTY 0x6a #define ET_WIDTH_LOW 0x6b #define ET_HEIGTH_LOW 0x6c #define ET_W_H_HEIGTH 0x6d #define ET_REG6e 0x6e /* OBW */ #define ET_REG6f 0x6f /* OBW */ #define ET_REG70 0x70 /* OBW_AWB */ #define ET_REG71 0x71 /* OBW_AWB */ #define ET_REG72 0x72 /* OBW_AWB */ #define ET_REG73 0x73 /* Clkdelay ns */ #define ET_REG74 0x74 /* test pattern */ #define ET_REG75 0x75 /* test pattern */ #define ET_I2C_CLK 0x8c #define ET_PXL_CLK 0x60 #define ET_I2C_BASE 0x89 #define ET_I2C_COUNT 0x8a #define ET_I2C_PREFETCH 0x8b #define ET_I2C_REG 0x88 #define ET_I2C_DATA7 0x87 #define ET_I2C_DATA6 0x86 #define ET_I2C_DATA5 0x85 #define ET_I2C_DATA4 0x84 #define ET_I2C_DATA3 0x83 #define ET_I2C_DATA2 0x82 #define ET_I2C_DATA1 0x81 #define ET_I2C_DATA0 0x80 #define PAS106_REG2 0x02 /* pxlClk = systemClk/(reg2) */ #define PAS106_REG3 0x03 /* line/frame H [11..4] */ #define PAS106_REG4 0x04 /* line/frame L [3..0] */ #define PAS106_REG5 0x05 /* exposure time line offset(default 5) */ #define PAS106_REG6 0x06 /* exposure time pixel offset(default 6) */ #define PAS106_REG7 0x07 /* signbit Dac (default 0) */ #define PAS106_REG9 0x09 #define PAS106_REG0e 0x0e /* global gain [4..0](default 0x0e) */ #define PAS106_REG13 0x13 /* end i2c write */ static const __u8 GainRGBG[] = { 0x80, 0x80, 0x80, 0x80, 0x00, 0x00 }; static const __u8 I2c2[] = { 0x08, 0x08, 0x08, 0x08, 0x0d }; static const __u8 I2c3[] = { 0x12, 0x05 }; static const __u8 I2c4[] = { 0x41, 0x08 }; /* read 'len' bytes to gspca_dev->usb_buf */ static void reg_r(struct gspca_dev *gspca_dev, __u16 index, __u16 len) { struct usb_device *dev = gspca_dev->dev; if (len > USB_BUF_SZ) { gspca_err(gspca_dev, "reg_r: buffer overflow\n"); return; } usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), 0, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 0, index, gspca_dev->usb_buf, len, 500); gspca_dbg(gspca_dev, D_USBI, "reg read [%02x] -> %02x ..\n", index, gspca_dev->usb_buf[0]); } static void reg_w_val(struct gspca_dev *gspca_dev, __u16 index, __u8 val) { struct usb_device *dev = gspca_dev->dev; gspca_dev->usb_buf[0] = val; usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 0, index, gspca_dev->usb_buf, 1, 500); } static void reg_w(struct gspca_dev *gspca_dev, __u16 index, const __u8 *buffer, __u16 len) { struct usb_device *dev = gspca_dev->dev; if (len > USB_BUF_SZ) { pr_err("reg_w: buffer overflow\n"); return; } gspca_dbg(gspca_dev, D_USBO, "reg write [%02x] = %02x..\n", index, *buffer); memcpy(gspca_dev->usb_buf, buffer, len); usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 0, index, gspca_dev->usb_buf, len, 500); } static int i2c_w(struct gspca_dev *gspca_dev, __u8 reg, const __u8 *buffer, int len, __u8 mode) { /* buffer should be [D0..D7] */ __u8 ptchcount; /* set the base address */ reg_w_val(gspca_dev, ET_I2C_BASE, 0x40); /* sensor base for the pas106 */ /* set count and prefetch */ ptchcount = ((len & 0x07) << 4) | (mode & 0x03); reg_w_val(gspca_dev, ET_I2C_COUNT, ptchcount); /* set the register base */ reg_w_val(gspca_dev, ET_I2C_REG, reg); while (--len >= 0) reg_w_val(gspca_dev, ET_I2C_DATA0 + len, buffer[len]); return 0; } static int i2c_r(struct gspca_dev *gspca_dev, __u8 reg) { /* set the base address */ reg_w_val(gspca_dev, ET_I2C_BASE, 0x40); /* sensor base for the pas106 */ /* set count and prefetch (cnd: 4 bits - mode: 4 bits) */ reg_w_val(gspca_dev, ET_I2C_COUNT, 0x11); reg_w_val(gspca_dev, ET_I2C_REG, reg); /* set the register base */ reg_w_val(gspca_dev, ET_I2C_PREFETCH, 0x02); /* prefetch */ reg_w_val(gspca_dev, ET_I2C_PREFETCH, 0x00); reg_r(gspca_dev, ET_I2C_DATA0, 1); /* read one byte */ return 0; } static int Et_WaitStatus(struct gspca_dev *gspca_dev) { int retry = 10; while (retry--) { reg_r(gspca_dev, ET_ClCK, 1); if (gspca_dev->usb_buf[0] != 0) return 1; } return 0; } static int et_video(struct gspca_dev *gspca_dev, int on) { int ret; reg_w_val(gspca_dev, ET_GPIO_OUT, on ? 0x10 /* startvideo - set Bit5 */ : 0); /* stopvideo */ ret = Et_WaitStatus(gspca_dev); if (ret != 0) gspca_err(gspca_dev, "timeout video on/off\n"); return ret; } static void Et_init2(struct gspca_dev *gspca_dev) { __u8 value; static const __u8 FormLine[] = { 0x84, 0x03, 0x14, 0xf4, 0x01, 0x05 }; gspca_dbg(gspca_dev, D_STREAM, "Open Init2 ET\n"); reg_w_val(gspca_dev, ET_GPIO_DIR_CTRL, 0x2f); reg_w_val(gspca_dev, ET_GPIO_OUT, 0x10); reg_r(gspca_dev, ET_GPIO_IN, 1); reg_w_val(gspca_dev, ET_ClCK, 0x14); /* 0x14 // 0x16 enabled pattern */ reg_w_val(gspca_dev, ET_CTRL, 0x1b); /* compression et subsampling */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) value = ET_COMP_VAL1; /* 320 */ else value = ET_COMP_VAL0; /* 640 */ reg_w_val(gspca_dev, ET_COMP, value); reg_w_val(gspca_dev, ET_MAXQt, 0x1f); reg_w_val(gspca_dev, ET_MINQt, 0x04); /* undocumented registers */ reg_w_val(gspca_dev, ET_REG1d, 0xff); reg_w_val(gspca_dev, ET_REG1e, 0xff); reg_w_val(gspca_dev, ET_REG1f, 0xff); reg_w_val(gspca_dev, ET_REG20, 0x35); reg_w_val(gspca_dev, ET_REG21, 0x01); reg_w_val(gspca_dev, ET_REG22, 0x00); reg_w_val(gspca_dev, ET_REG23, 0xff); reg_w_val(gspca_dev, ET_REG24, 0xff); reg_w_val(gspca_dev, ET_REG25, 0x0f); /* colors setting */ reg_w_val(gspca_dev, 0x30, 0x11); /* 0x30 */ reg_w_val(gspca_dev, 0x31, 0x40); reg_w_val(gspca_dev, 0x32, 0x00); reg_w_val(gspca_dev, ET_O_RED, 0x00); /* 0x34 */ reg_w_val(gspca_dev, ET_O_GREEN1, 0x00); reg_w_val(gspca_dev, ET_O_BLUE, 0x00); reg_w_val(gspca_dev, ET_O_GREEN2, 0x00); /*************/ reg_w_val(gspca_dev, ET_G_RED, 0x80); /* 0x4d */ reg_w_val(gspca_dev, ET_G_GREEN1, 0x80); reg_w_val(gspca_dev, ET_G_BLUE, 0x80); reg_w_val(gspca_dev, ET_G_GREEN2, 0x80); reg_w_val(gspca_dev, ET_G_GR_H, 0x00); reg_w_val(gspca_dev, ET_G_GB_H, 0x00); /* 0x52 */ /* Window control registers */ reg_w_val(gspca_dev, 0x61, 0x80); /* use cmc_out */ reg_w_val(gspca_dev, 0x62, 0x02); reg_w_val(gspca_dev, 0x63, 0x03); reg_w_val(gspca_dev, 0x64, 0x14); reg_w_val(gspca_dev, 0x65, 0x0e); reg_w_val(gspca_dev, 0x66, 0x02); reg_w_val(gspca_dev, 0x67, 0x02); /**************************************/ reg_w_val(gspca_dev, ET_SYNCHRO, 0x8f); /* 0x68 */ reg_w_val(gspca_dev, ET_STARTX, 0x69); /* 0x6a //0x69 */ reg_w_val(gspca_dev, ET_STARTY, 0x0d); /* 0x0d //0x0c */ reg_w_val(gspca_dev, ET_WIDTH_LOW, 0x80); reg_w_val(gspca_dev, ET_HEIGTH_LOW, 0xe0); reg_w_val(gspca_dev, ET_W_H_HEIGTH, 0x60); /* 6d */ reg_w_val(gspca_dev, ET_REG6e, 0x86); reg_w_val(gspca_dev, ET_REG6f, 0x01); reg_w_val(gspca_dev, ET_REG70, 0x26); reg_w_val(gspca_dev, ET_REG71, 0x7a); reg_w_val(gspca_dev, ET_REG72, 0x01); /* Clock Pattern registers ***************** */ reg_w_val(gspca_dev, ET_REG73, 0x00); reg_w_val(gspca_dev, ET_REG74, 0x18); /* 0x28 */ reg_w_val(gspca_dev, ET_REG75, 0x0f); /* 0x01 */ /**********************************************/ reg_w_val(gspca_dev, 0x8a, 0x20); reg_w_val(gspca_dev, 0x8d, 0x0f); reg_w_val(gspca_dev, 0x8e, 0x08); /**************************************/ reg_w_val(gspca_dev, 0x03, 0x08); reg_w_val(gspca_dev, ET_PXL_CLK, 0x03); reg_w_val(gspca_dev, 0x81, 0xff); reg_w_val(gspca_dev, 0x80, 0x00); reg_w_val(gspca_dev, 0x81, 0xff); reg_w_val(gspca_dev, 0x80, 0x20); reg_w_val(gspca_dev, 0x03, 0x01); reg_w_val(gspca_dev, 0x03, 0x00); reg_w_val(gspca_dev, 0x03, 0x08); /********************************************/ /* reg_r(gspca_dev, ET_I2C_BASE, 1); always 0x40 as the pas106 ??? */ /* set the sensor */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) value = 0x04; /* 320 */ else /* 640 */ value = 0x1e; /* 0x17 * setting PixelClock * 0x03 mean 24/(3+1) = 6 Mhz * 0x05 -> 24/(5+1) = 4 Mhz * 0x0b -> 24/(11+1) = 2 Mhz * 0x17 -> 24/(23+1) = 1 Mhz */ reg_w_val(gspca_dev, ET_PXL_CLK, value); /* now set by fifo the FormatLine setting */ reg_w(gspca_dev, 0x62, FormLine, 6); /* set exposure times [ 0..0x78] 0->longvalue 0x78->shortvalue */ reg_w_val(gspca_dev, 0x81, 0x47); /* 0x47; */ reg_w_val(gspca_dev, 0x80, 0x40); /* 0x40; */ /* Pedro change */ /* Brightness change Brith+ decrease value */ /* Brigth- increase value */ /* original value = 0x70; */ reg_w_val(gspca_dev, 0x81, 0x30); /* 0x20; - set brightness */ reg_w_val(gspca_dev, 0x80, 0x20); /* 0x20; */ } static void setbrightness(struct gspca_dev *gspca_dev, s32 val) { int i; for (i = 0; i < 4; i++) reg_w_val(gspca_dev, ET_O_RED + i, val); } static void setcontrast(struct gspca_dev *gspca_dev, s32 val) { __u8 RGBG[] = { 0x80, 0x80, 0x80, 0x80, 0x00, 0x00 }; memset(RGBG, val, sizeof(RGBG) - 2); reg_w(gspca_dev, ET_G_RED, RGBG, 6); } static void setcolors(struct gspca_dev *gspca_dev, s32 val) { struct sd *sd = (struct sd *) gspca_dev; __u8 I2cc[] = { 0x05, 0x02, 0x02, 0x05, 0x0d }; __u8 i2cflags = 0x01; /* __u8 green = 0; */ I2cc[3] = val; /* red */ I2cc[0] = 15 - val; /* blue */ /* green = 15 - ((((7*I2cc[0]) >> 2 ) + I2cc[3]) >> 1); */ /* I2cc[1] = I2cc[2] = green; */ if (sd->sensor == SENSOR_PAS106) { i2c_w(gspca_dev, PAS106_REG13, &i2cflags, 1, 3); i2c_w(gspca_dev, PAS106_REG9, I2cc, sizeof I2cc, 1); } } static s32 getcolors(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) { /* i2c_r(gspca_dev, PAS106_REG9); * blue */ i2c_r(gspca_dev, PAS106_REG9 + 3); /* red */ return gspca_dev->usb_buf[0] & 0x0f; } return 0; } static void setautogain(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->autogain) sd->ag_cnt = AG_CNT_START; else sd->ag_cnt = -1; } static void Et_init1(struct gspca_dev *gspca_dev) { __u8 value; /* __u8 I2c0 [] = {0x0a, 0x12, 0x05, 0x22, 0xac, 0x00, 0x01, 0x00}; */ __u8 I2c0[] = { 0x0a, 0x12, 0x05, 0x6d, 0xcd, 0x00, 0x01, 0x00 }; /* try 1/120 0x6d 0xcd 0x40 */ /* __u8 I2c0 [] = {0x0a, 0x12, 0x05, 0xfe, 0xfe, 0xc0, 0x01, 0x00}; * 1/60000 hmm ?? */ gspca_dbg(gspca_dev, D_STREAM, "Open Init1 ET\n\n"); reg_w_val(gspca_dev, ET_GPIO_DIR_CTRL, 7); reg_r(gspca_dev, ET_GPIO_IN, 1); reg_w_val(gspca_dev, ET_RESET_ALL, 1); reg_w_val(gspca_dev, ET_RESET_ALL, 0); reg_w_val(gspca_dev, ET_ClCK, 0x10); reg_w_val(gspca_dev, ET_CTRL, 0x19); /* compression et subsampling */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) value = ET_COMP_VAL1; else value = ET_COMP_VAL0; gspca_dbg(gspca_dev, D_STREAM, "Open mode %d Compression %d\n", gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv, value); reg_w_val(gspca_dev, ET_COMP, value); reg_w_val(gspca_dev, ET_MAXQt, 0x1d); reg_w_val(gspca_dev, ET_MINQt, 0x02); /* undocumented registers */ reg_w_val(gspca_dev, ET_REG1d, 0xff); reg_w_val(gspca_dev, ET_REG1e, 0xff); reg_w_val(gspca_dev, ET_REG1f, 0xff); reg_w_val(gspca_dev, ET_REG20, 0x35); reg_w_val(gspca_dev, ET_REG21, 0x01); reg_w_val(gspca_dev, ET_REG22, 0x00); reg_w_val(gspca_dev, ET_REG23, 0xf7); reg_w_val(gspca_dev, ET_REG24, 0xff); reg_w_val(gspca_dev, ET_REG25, 0x07); /* colors setting */ reg_w_val(gspca_dev, ET_G_RED, 0x80); reg_w_val(gspca_dev, ET_G_GREEN1, 0x80); reg_w_val(gspca_dev, ET_G_BLUE, 0x80); reg_w_val(gspca_dev, ET_G_GREEN2, 0x80); reg_w_val(gspca_dev, ET_G_GR_H, 0x00); reg_w_val(gspca_dev, ET_G_GB_H, 0x00); /* Window control registers */ reg_w_val(gspca_dev, ET_SYNCHRO, 0xf0); reg_w_val(gspca_dev, ET_STARTX, 0x56); /* 0x56 */ reg_w_val(gspca_dev, ET_STARTY, 0x05); /* 0x04 */ reg_w_val(gspca_dev, ET_WIDTH_LOW, 0x60); reg_w_val(gspca_dev, ET_HEIGTH_LOW, 0x20); reg_w_val(gspca_dev, ET_W_H_HEIGTH, 0x50); reg_w_val(gspca_dev, ET_REG6e, 0x86); reg_w_val(gspca_dev, ET_REG6f, 0x01); reg_w_val(gspca_dev, ET_REG70, 0x86); reg_w_val(gspca_dev, ET_REG71, 0x14); reg_w_val(gspca_dev, ET_REG72, 0x00); /* Clock Pattern registers */ reg_w_val(gspca_dev, ET_REG73, 0x00); reg_w_val(gspca_dev, ET_REG74, 0x00); reg_w_val(gspca_dev, ET_REG75, 0x0a); reg_w_val(gspca_dev, ET_I2C_CLK, 0x04); reg_w_val(gspca_dev, ET_PXL_CLK, 0x01); /* set the sensor */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) { I2c0[0] = 0x06; i2c_w(gspca_dev, PAS106_REG2, I2c0, sizeof I2c0, 1); i2c_w(gspca_dev, PAS106_REG9, I2c2, sizeof I2c2, 1); value = 0x06; i2c_w(gspca_dev, PAS106_REG2, &value, 1, 1); i2c_w(gspca_dev, PAS106_REG3, I2c3, sizeof I2c3, 1); /* value = 0x1f; */ value = 0x04; i2c_w(gspca_dev, PAS106_REG0e, &value, 1, 1); } else { I2c0[0] = 0x0a; i2c_w(gspca_dev, PAS106_REG2, I2c0, sizeof I2c0, 1); i2c_w(gspca_dev, PAS106_REG9, I2c2, sizeof I2c2, 1); value = 0x0a; i2c_w(gspca_dev, PAS106_REG2, &value, 1, 1); i2c_w(gspca_dev, PAS106_REG3, I2c3, sizeof I2c3, 1); value = 0x04; /* value = 0x10; */ i2c_w(gspca_dev, PAS106_REG0e, &value, 1, 1); /* bit 2 enable bit 1:2 select 0 1 2 3 value = 0x07; * curve 0 * i2c_w(gspca_dev, PAS106_REG0f, &value, 1, 1); */ } /* value = 0x01; */ /* value = 0x22; */ /* i2c_w(gspca_dev, PAS106_REG5, &value, 1, 1); */ /* magnetude and sign bit for DAC */ i2c_w(gspca_dev, PAS106_REG7, I2c4, sizeof I2c4, 1); /* now set by fifo the whole colors setting */ reg_w(gspca_dev, ET_G_RED, GainRGBG, 6); setcolors(gspca_dev, getcolors(gspca_dev)); } /* this function is called at probe time */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct sd *sd = (struct sd *) gspca_dev; struct cam *cam; cam = &gspca_dev->cam; sd->sensor = id->driver_info; if (sd->sensor == SENSOR_PAS106) { cam->cam_mode = sif_mode; cam->nmodes = ARRAY_SIZE(sif_mode); } else { cam->cam_mode = vga_mode; cam->nmodes = ARRAY_SIZE(vga_mode); } sd->ag_cnt = -1; return 0; } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) Et_init1(gspca_dev); else Et_init2(gspca_dev); reg_w_val(gspca_dev, ET_RESET_ALL, 0x08); et_video(gspca_dev, 0); /* video off */ return 0; } /* -- start the camera -- */ static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) Et_init1(gspca_dev); else Et_init2(gspca_dev); setautogain(gspca_dev); reg_w_val(gspca_dev, ET_RESET_ALL, 0x08); et_video(gspca_dev, 1); /* video on */ return 0; } static void sd_stopN(struct gspca_dev *gspca_dev) { et_video(gspca_dev, 0); /* video off */ } static __u8 Et_getgainG(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) { i2c_r(gspca_dev, PAS106_REG0e); gspca_dbg(gspca_dev, D_CONF, "Etoms gain G %d\n", gspca_dev->usb_buf[0]); return gspca_dev->usb_buf[0]; } return 0x1f; } static void Et_setgainG(struct gspca_dev *gspca_dev, __u8 gain) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) { __u8 i2cflags = 0x01; i2c_w(gspca_dev, PAS106_REG13, &i2cflags, 1, 3); i2c_w(gspca_dev, PAS106_REG0e, &gain, 1, 1); } } #define BLIMIT(bright) \ (u8)((bright > 0x1f) ? 0x1f : ((bright < 4) ? 3 : bright)) #define LIMIT(color) \ (u8)((color > 0xff) ? 0xff : ((color < 0) ? 0 : color)) static void do_autogain(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; __u8 luma; __u8 luma_mean = 128; __u8 luma_delta = 20; __u8 spring = 4; int Gbright; __u8 r, g, b; if (sd->ag_cnt < 0) return; if (--sd->ag_cnt >= 0) return; sd->ag_cnt = AG_CNT_START; Gbright = Et_getgainG(gspca_dev); reg_r(gspca_dev, ET_LUMA_CENTER, 4); g = (gspca_dev->usb_buf[0] + gspca_dev->usb_buf[3]) >> 1; r = gspca_dev->usb_buf[1]; b = gspca_dev->usb_buf[2]; r = ((r << 8) - (r << 4) - (r << 3)) >> 10; b = ((b << 7) >> 10); g = ((g << 9) + (g << 7) + (g << 5)) >> 10; luma = LIMIT(r + g + b); gspca_dbg(gspca_dev, D_FRAM, "Etoms luma G %d\n", luma); if (luma < luma_mean - luma_delta || luma > luma_mean + luma_delta) { Gbright += (luma_mean - luma) >> spring; Gbright = BLIMIT(Gbright); gspca_dbg(gspca_dev, D_FRAM, "Etoms Gbright %d\n", Gbright); Et_setgainG(gspca_dev, (__u8) Gbright); } } #undef BLIMIT #undef LIMIT static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { int seqframe; seqframe = data[0] & 0x3f; len = (int) (((data[0] & 0xc0) << 2) | data[1]); if (seqframe == 0x3f) { gspca_dbg(gspca_dev, D_FRAM, "header packet found datalength %d !!\n", len); gspca_dbg(gspca_dev, D_FRAM, "G %d R %d G %d B %d", data[2], data[3], data[4], data[5]); data += 30; /* don't change datalength as the chips provided it */ gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); gspca_frame_add(gspca_dev, FIRST_PACKET, data, len); return; } if (len) { data += 8; gspca_frame_add(gspca_dev, INTER_PACKET, data, len); } else { /* Drop Packet */ gspca_dev->last_packet_type = DISCARD_PACKET; } } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *)gspca_dev; gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: setbrightness(gspca_dev, ctrl->val); break; case V4L2_CID_CONTRAST: setcontrast(gspca_dev, ctrl->val); break; case V4L2_CID_SATURATION: setcolors(gspca_dev, ctrl->val); break; case V4L2_CID_AUTOGAIN: sd->autogain = ctrl->val; setautogain(gspca_dev); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *)gspca_dev; struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 4); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BRIGHTNESS, 1, 127, 1, 63); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_CONTRAST, 0, 255, 1, 127); if (sd->sensor == SENSOR_PAS106) v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_SATURATION, 0, 15, 1, 7); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_AUTOGAIN, 0, 1, 1, 1); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, .dq_callback = do_autogain, }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x102c, 0x6151), .driver_info = SENSOR_PAS106}, {USB_DEVICE(0x102c, 0x6251), .driver_info = SENSOR_TAS5130CXX}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver); |
| 129 129 41 41 5 5 1 1 2 2 2 2 2 127 39 5 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> * * Generic memory allocators */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/dma-map-ops.h> #include <linux/genalloc.h> #include <linux/highmem.h> #include <linux/vmalloc.h> #ifdef CONFIG_X86 #include <asm/set_memory.h> #endif #include <sound/memalloc.h> struct snd_malloc_ops { void *(*alloc)(struct snd_dma_buffer *dmab, size_t size); void (*free)(struct snd_dma_buffer *dmab); dma_addr_t (*get_addr)(struct snd_dma_buffer *dmab, size_t offset); struct page *(*get_page)(struct snd_dma_buffer *dmab, size_t offset); unsigned int (*get_chunk_size)(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size); int (*mmap)(struct snd_dma_buffer *dmab, struct vm_area_struct *area); void (*sync)(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode); }; #define DEFAULT_GFP \ (GFP_KERNEL | \ __GFP_RETRY_MAYFAIL | /* don't trigger OOM-killer */ \ __GFP_NOWARN) /* no stack trace print - this call is non-critical */ static const struct snd_malloc_ops *snd_dma_get_ops(struct snd_dma_buffer *dmab); static void *__snd_dma_alloc_pages(struct snd_dma_buffer *dmab, size_t size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (WARN_ON_ONCE(!ops || !ops->alloc)) return NULL; return ops->alloc(dmab, size); } /** * snd_dma_alloc_dir_pages - allocate the buffer area according to the given * type and direction * @type: the DMA buffer type * @device: the device pointer * @dir: DMA direction * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_dir_pages(int type, struct device *device, enum dma_data_direction dir, size_t size, struct snd_dma_buffer *dmab) { if (WARN_ON(!size)) return -ENXIO; if (WARN_ON(!dmab)) return -ENXIO; size = PAGE_ALIGN(size); dmab->dev.type = type; dmab->dev.dev = device; dmab->dev.dir = dir; dmab->bytes = 0; dmab->addr = 0; dmab->private_data = NULL; dmab->area = __snd_dma_alloc_pages(dmab, size); if (!dmab->area) return -ENOMEM; dmab->bytes = size; return 0; } EXPORT_SYMBOL(snd_dma_alloc_dir_pages); /** * snd_dma_alloc_pages_fallback - allocate the buffer area according to the given type with fallback * @type: the DMA buffer type * @device: the device pointer * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. When no space is left, this function reduces the size and * tries to allocate again. The size actually allocated is stored in * res_size argument. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_pages_fallback(int type, struct device *device, size_t size, struct snd_dma_buffer *dmab) { int err; while ((err = snd_dma_alloc_pages(type, device, size, dmab)) < 0) { if (err != -ENOMEM) return err; if (size <= PAGE_SIZE) return -ENOMEM; size >>= 1; size = PAGE_SIZE << get_order(size); } if (! dmab->area) return -ENOMEM; return 0; } EXPORT_SYMBOL(snd_dma_alloc_pages_fallback); /** * snd_dma_free_pages - release the allocated buffer * @dmab: the buffer allocation record to release * * Releases the allocated buffer via snd_dma_alloc_pages(). */ void snd_dma_free_pages(struct snd_dma_buffer *dmab) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->free) ops->free(dmab); } EXPORT_SYMBOL(snd_dma_free_pages); /* called by devres */ static void __snd_release_pages(struct device *dev, void *res) { snd_dma_free_pages(res); } /** * snd_devm_alloc_dir_pages - allocate the buffer and manage with devres * @dev: the device pointer * @type: the DMA buffer type * @dir: DMA direction * @size: the buffer size to allocate * * Allocate buffer pages depending on the given type and manage using devres. * The pages will be released automatically at the device removal. * * Unlike snd_dma_alloc_pages(), this function requires the real device pointer, * hence it can't work with SNDRV_DMA_TYPE_CONTINUOUS or * SNDRV_DMA_TYPE_VMALLOC type. * * Return: the snd_dma_buffer object at success, or NULL if failed */ struct snd_dma_buffer * snd_devm_alloc_dir_pages(struct device *dev, int type, enum dma_data_direction dir, size_t size) { struct snd_dma_buffer *dmab; int err; if (WARN_ON(type == SNDRV_DMA_TYPE_CONTINUOUS || type == SNDRV_DMA_TYPE_VMALLOC)) return NULL; dmab = devres_alloc(__snd_release_pages, sizeof(*dmab), GFP_KERNEL); if (!dmab) return NULL; err = snd_dma_alloc_dir_pages(type, dev, dir, size, dmab); if (err < 0) { devres_free(dmab); return NULL; } devres_add(dev, dmab); return dmab; } EXPORT_SYMBOL_GPL(snd_devm_alloc_dir_pages); /** * snd_dma_buffer_mmap - perform mmap of the given DMA buffer * @dmab: buffer allocation information * @area: VM area information * * Return: zero if successful, or a negative error code */ int snd_dma_buffer_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { const struct snd_malloc_ops *ops; if (!dmab) return -ENOENT; ops = snd_dma_get_ops(dmab); if (ops && ops->mmap) return ops->mmap(dmab, area); else return -ENOENT; } EXPORT_SYMBOL(snd_dma_buffer_mmap); #ifdef CONFIG_HAS_DMA /** * snd_dma_buffer_sync - sync DMA buffer between CPU and device * @dmab: buffer allocation information * @mode: sync mode */ void snd_dma_buffer_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { const struct snd_malloc_ops *ops; if (!dmab || !dmab->dev.need_sync) return; ops = snd_dma_get_ops(dmab); if (ops && ops->sync) ops->sync(dmab, mode); } EXPORT_SYMBOL_GPL(snd_dma_buffer_sync); #endif /* CONFIG_HAS_DMA */ /** * snd_sgbuf_get_addr - return the physical address at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the physical address */ dma_addr_t snd_sgbuf_get_addr(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_addr) return ops->get_addr(dmab, offset); else return dmab->addr + offset; } EXPORT_SYMBOL(snd_sgbuf_get_addr); /** * snd_sgbuf_get_page - return the physical page at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the page pointer */ struct page *snd_sgbuf_get_page(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_page) return ops->get_page(dmab, offset); else return virt_to_page(dmab->area + offset); } EXPORT_SYMBOL(snd_sgbuf_get_page); /** * snd_sgbuf_get_chunk_size - compute the max chunk size with continuous pages * on sg-buffer * @dmab: buffer allocation information * @ofs: offset in the ring buffer * @size: the requested size * * Return: the chunk size */ unsigned int snd_sgbuf_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_chunk_size) return ops->get_chunk_size(dmab, ofs, size); else return size; } EXPORT_SYMBOL(snd_sgbuf_get_chunk_size); /* * Continuous pages allocator */ static void *do_alloc_pages(struct device *dev, size_t size, dma_addr_t *addr, bool wc) { void *p; gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; again: p = alloc_pages_exact(size, gfp); if (!p) return NULL; *addr = page_to_phys(virt_to_page(p)); if (!dev) return p; if ((*addr + size - 1) & ~dev->coherent_dma_mask) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && !(gfp & GFP_DMA32)) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } } #ifdef CONFIG_X86 if (wc) set_memory_wc((unsigned long)(p), size >> PAGE_SHIFT); #endif return p; } static void do_free_pages(void *p, size_t size, bool wc) { #ifdef CONFIG_X86 if (wc) set_memory_wb((unsigned long)(p), size >> PAGE_SHIFT); #endif free_pages_exact(p, size); } static void *snd_dma_continuous_alloc(struct snd_dma_buffer *dmab, size_t size) { return do_alloc_pages(dmab->dev.dev, size, &dmab->addr, false); } static void snd_dma_continuous_free(struct snd_dma_buffer *dmab) { do_free_pages(dmab->area, dmab->bytes, false); } static int snd_dma_continuous_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_continuous_ops = { .alloc = snd_dma_continuous_alloc, .free = snd_dma_continuous_free, .mmap = snd_dma_continuous_mmap, }; /* * VMALLOC allocator */ static void *snd_dma_vmalloc_alloc(struct snd_dma_buffer *dmab, size_t size) { return vmalloc(size); } static void snd_dma_vmalloc_free(struct snd_dma_buffer *dmab) { vfree(dmab->area); } static int snd_dma_vmalloc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_vmalloc_range(area, dmab->area, 0); } #define get_vmalloc_page_addr(dmab, offset) \ page_to_phys(vmalloc_to_page((dmab)->area + (offset))) static dma_addr_t snd_dma_vmalloc_get_addr(struct snd_dma_buffer *dmab, size_t offset) { return get_vmalloc_page_addr(dmab, offset) + offset % PAGE_SIZE; } static struct page *snd_dma_vmalloc_get_page(struct snd_dma_buffer *dmab, size_t offset) { return vmalloc_to_page(dmab->area + offset); } static unsigned int snd_dma_vmalloc_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ /* check page continuity */ addr = get_vmalloc_page_addr(dmab, start); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (get_vmalloc_page_addr(dmab, start) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_vmalloc_ops = { .alloc = snd_dma_vmalloc_alloc, .free = snd_dma_vmalloc_free, .mmap = snd_dma_vmalloc_mmap, .get_addr = snd_dma_vmalloc_get_addr, .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; #ifdef CONFIG_HAS_DMA /* * IRAM allocator */ #ifdef CONFIG_GENERIC_ALLOCATOR static void *snd_dma_iram_alloc(struct snd_dma_buffer *dmab, size_t size) { struct device *dev = dmab->dev.dev; struct gen_pool *pool; void *p; if (dev->of_node) { pool = of_gen_pool_get(dev->of_node, "iram", 0); /* Assign the pool into private_data field */ dmab->private_data = pool; p = gen_pool_dma_alloc_align(pool, size, &dmab->addr, PAGE_SIZE); if (p) return p; } /* Internal memory might have limited size and no enough space, * so if we fail to malloc, try to fetch memory traditionally. */ dmab->dev.type = SNDRV_DMA_TYPE_DEV; return __snd_dma_alloc_pages(dmab, size); } static void snd_dma_iram_free(struct snd_dma_buffer *dmab) { struct gen_pool *pool = dmab->private_data; if (pool && dmab->area) gen_pool_free(pool, (unsigned long)dmab->area, dmab->bytes); } static int snd_dma_iram_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_iram_ops = { .alloc = snd_dma_iram_alloc, .free = snd_dma_iram_free, .mmap = snd_dma_iram_mmap, }; #endif /* CONFIG_GENERIC_ALLOCATOR */ /* * Coherent device pages allocator */ static void *snd_dma_dev_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_dev_free(struct snd_dma_buffer *dmab) { dma_free_coherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_dev_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } static const struct snd_malloc_ops snd_dma_dev_ops = { .alloc = snd_dma_dev_alloc, .free = snd_dma_dev_free, .mmap = snd_dma_dev_mmap, }; /* * Write-combined pages */ #ifdef CONFIG_SND_DMA_SGBUF /* x86-specific allocations */ static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { void *p = do_alloc_pages(dmab->dev.dev, size, &dmab->addr, true); if (!p) return NULL; dmab->addr = dma_map_single(dmab->dev.dev, p, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dmab->dev.dev, dmab->addr)) { do_free_pages(dmab->area, size, true); return NULL; } return p; } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { dma_unmap_single(dmab->dev.dev, dmab->addr, dmab->bytes, DMA_BIDIRECTIONAL); do_free_pages(dmab->area, dmab->bytes, true); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } #else static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_wc(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { dma_free_wc(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_wc(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } #endif static const struct snd_malloc_ops snd_dma_wc_ops = { .alloc = snd_dma_wc_alloc, .free = snd_dma_wc_free, .mmap = snd_dma_wc_mmap, }; /* * Non-contiguous pages allocator */ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) { struct sg_table *sgt; void *p; sgt = dma_alloc_noncontiguous(dmab->dev.dev, size, dmab->dev.dir, DEFAULT_GFP, 0); if (!sgt) return NULL; dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, sg_dma_address(sgt->sgl)); p = dma_vmap_noncontiguous(dmab->dev.dev, size, sgt); if (p) { dmab->private_data = sgt; /* store the first page address for convenience */ dmab->addr = snd_sgbuf_get_addr(dmab, 0); } else { dma_free_noncontiguous(dmab->dev.dev, size, sgt, dmab->dev.dir); } return p; } static void snd_dma_noncontig_free(struct snd_dma_buffer *dmab) { dma_vunmap_noncontiguous(dmab->dev.dev, dmab->area); dma_free_noncontiguous(dmab->dev.dev, dmab->bytes, dmab->private_data, dmab->dev.dir); } static int snd_dma_noncontig_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_noncontiguous(dmab->dev.dev, area, dmab->bytes, dmab->private_data); } static void snd_dma_noncontig_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir == DMA_TO_DEVICE) return; invalidate_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_cpu(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } else { if (dmab->dev.dir == DMA_FROM_DEVICE) return; flush_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_device(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } } static inline void snd_dma_noncontig_iter_set(struct snd_dma_buffer *dmab, struct sg_page_iter *piter, size_t offset) { struct sg_table *sgt = dmab->private_data; __sg_page_iter_start(piter, sgt->sgl, sgt->orig_nents, offset >> PAGE_SHIFT); } static dma_addr_t snd_dma_noncontig_get_addr(struct snd_dma_buffer *dmab, size_t offset) { struct sg_dma_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter.base, offset); __sg_page_iter_dma_next(&iter); return sg_page_iter_dma_address(&iter) + offset % PAGE_SIZE; } static struct page *snd_dma_noncontig_get_page(struct snd_dma_buffer *dmab, size_t offset) { struct sg_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter, offset); __sg_page_iter_next(&iter); return sg_page_iter_page(&iter); } static unsigned int snd_dma_noncontig_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { struct sg_dma_page_iter iter; unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ snd_dma_noncontig_iter_set(dmab, &iter.base, start); if (!__sg_page_iter_dma_next(&iter)) return 0; /* check page continuity */ addr = sg_page_iter_dma_address(&iter); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (!__sg_page_iter_dma_next(&iter) || sg_page_iter_dma_address(&iter) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_noncontig_ops = { .alloc = snd_dma_noncontig_alloc, .free = snd_dma_noncontig_free, .mmap = snd_dma_noncontig_mmap, .sync = snd_dma_noncontig_sync, .get_addr = snd_dma_noncontig_get_addr, .get_page = snd_dma_noncontig_get_page, .get_chunk_size = snd_dma_noncontig_get_chunk_size, }; #ifdef CONFIG_SND_DMA_SGBUF /* Fallback SG-buffer allocations for x86 */ struct snd_dma_sg_fallback { struct sg_table sgt; /* used by get_addr - must be the first item */ size_t count; struct page **pages; unsigned int *npages; }; static void __snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab, struct snd_dma_sg_fallback *sgbuf) { bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG; size_t i, size; if (sgbuf->pages && sgbuf->npages) { i = 0; while (i < sgbuf->count) { size = sgbuf->npages[i]; if (!size) break; do_free_pages(page_address(sgbuf->pages[i]), size << PAGE_SHIFT, wc); i += size; } } kvfree(sgbuf->pages); kvfree(sgbuf->npages); kfree(sgbuf); } /* fallback manual S/G buffer allocations */ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) { bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG; struct snd_dma_sg_fallback *sgbuf; struct page **pagep, *curp; size_t chunk; dma_addr_t addr; unsigned int idx, npages; void *p; sgbuf = kzalloc(sizeof(*sgbuf), GFP_KERNEL); if (!sgbuf) return NULL; size = PAGE_ALIGN(size); sgbuf->count = size >> PAGE_SHIFT; sgbuf->pages = kvcalloc(sgbuf->count, sizeof(*sgbuf->pages), GFP_KERNEL); sgbuf->npages = kvcalloc(sgbuf->count, sizeof(*sgbuf->npages), GFP_KERNEL); if (!sgbuf->pages || !sgbuf->npages) goto error; pagep = sgbuf->pages; chunk = size; idx = 0; while (size > 0) { chunk = min(size, chunk); p = do_alloc_pages(dmab->dev.dev, chunk, &addr, wc); if (!p) { if (chunk <= PAGE_SIZE) goto error; chunk >>= 1; chunk = PAGE_SIZE << get_order(chunk); continue; } size -= chunk; /* fill pages */ npages = chunk >> PAGE_SHIFT; sgbuf->npages[idx] = npages; idx += npages; curp = virt_to_page(p); while (npages--) *pagep++ = curp++; } if (sg_alloc_table_from_pages(&sgbuf->sgt, sgbuf->pages, sgbuf->count, 0, sgbuf->count << PAGE_SHIFT, GFP_KERNEL)) goto error; if (dma_map_sgtable(dmab->dev.dev, &sgbuf->sgt, DMA_BIDIRECTIONAL, 0)) goto error_dma_map; p = vmap(sgbuf->pages, sgbuf->count, VM_MAP, PAGE_KERNEL); if (!p) goto error_vmap; dmab->private_data = sgbuf; /* store the first page address for convenience */ dmab->addr = snd_sgbuf_get_addr(dmab, 0); return p; error_vmap: dma_unmap_sgtable(dmab->dev.dev, &sgbuf->sgt, DMA_BIDIRECTIONAL, 0); error_dma_map: sg_free_table(&sgbuf->sgt); error: __snd_dma_sg_fallback_free(dmab, sgbuf); return NULL; } static void snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab) { struct snd_dma_sg_fallback *sgbuf = dmab->private_data; vunmap(dmab->area); dma_unmap_sgtable(dmab->dev.dev, &sgbuf->sgt, DMA_BIDIRECTIONAL, 0); sg_free_table(&sgbuf->sgt); __snd_dma_sg_fallback_free(dmab, dmab->private_data); } static int snd_dma_sg_fallback_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { struct snd_dma_sg_fallback *sgbuf = dmab->private_data; if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG) area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return vm_map_pages(area, sgbuf->pages, sgbuf->count); } static void *snd_dma_sg_alloc(struct snd_dma_buffer *dmab, size_t size) { int type = dmab->dev.type; void *p; /* try the standard DMA API allocation at first */ if (type == SNDRV_DMA_TYPE_DEV_WC_SG) dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC; else dmab->dev.type = SNDRV_DMA_TYPE_DEV; p = __snd_dma_alloc_pages(dmab, size); if (p) return p; dmab->dev.type = type; /* restore the type */ return snd_dma_sg_fallback_alloc(dmab, size); } static const struct snd_malloc_ops snd_dma_sg_ops = { .alloc = snd_dma_sg_alloc, .free = snd_dma_sg_fallback_free, .mmap = snd_dma_sg_fallback_mmap, /* reuse noncontig helper */ .get_addr = snd_dma_noncontig_get_addr, /* reuse vmalloc helpers */ .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; #endif /* CONFIG_SND_DMA_SGBUF */ /* * Non-coherent pages allocator */ static void *snd_dma_noncoherent_alloc(struct snd_dma_buffer *dmab, size_t size) { void *p; p = dma_alloc_noncoherent(dmab->dev.dev, size, &dmab->addr, dmab->dev.dir, DEFAULT_GFP); if (p) dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, dmab->addr); return p; } static void snd_dma_noncoherent_free(struct snd_dma_buffer *dmab) { dma_free_noncoherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr, dmab->dev.dir); } static int snd_dma_noncoherent_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = vm_get_page_prot(area->vm_flags); return dma_mmap_pages(dmab->dev.dev, area, area->vm_end - area->vm_start, virt_to_page(dmab->area)); } static void snd_dma_noncoherent_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir != DMA_TO_DEVICE) dma_sync_single_for_cpu(dmab->dev.dev, dmab->addr, dmab->bytes, dmab->dev.dir); } else { if (dmab->dev.dir != DMA_FROM_DEVICE) dma_sync_single_for_device(dmab->dev.dev, dmab->addr, dmab->bytes, dmab->dev.dir); } } static const struct snd_malloc_ops snd_dma_noncoherent_ops = { .alloc = snd_dma_noncoherent_alloc, .free = snd_dma_noncoherent_free, .mmap = snd_dma_noncoherent_mmap, .sync = snd_dma_noncoherent_sync, }; #endif /* CONFIG_HAS_DMA */ /* * Entry points */ static const struct snd_malloc_ops *snd_dma_ops[] = { [SNDRV_DMA_TYPE_CONTINUOUS] = &snd_dma_continuous_ops, [SNDRV_DMA_TYPE_VMALLOC] = &snd_dma_vmalloc_ops, #ifdef CONFIG_HAS_DMA [SNDRV_DMA_TYPE_DEV] = &snd_dma_dev_ops, [SNDRV_DMA_TYPE_DEV_WC] = &snd_dma_wc_ops, [SNDRV_DMA_TYPE_NONCONTIG] = &snd_dma_noncontig_ops, [SNDRV_DMA_TYPE_NONCOHERENT] = &snd_dma_noncoherent_ops, #ifdef CONFIG_SND_DMA_SGBUF [SNDRV_DMA_TYPE_DEV_SG] = &snd_dma_sg_ops, [SNDRV_DMA_TYPE_DEV_WC_SG] = &snd_dma_sg_ops, #endif #ifdef CONFIG_GENERIC_ALLOCATOR [SNDRV_DMA_TYPE_DEV_IRAM] = &snd_dma_iram_ops, #endif /* CONFIG_GENERIC_ALLOCATOR */ #endif /* CONFIG_HAS_DMA */ }; static const struct snd_malloc_ops *snd_dma_get_ops(struct snd_dma_buffer *dmab) { if (WARN_ON_ONCE(!dmab)) return NULL; if (WARN_ON_ONCE(dmab->dev.type <= SNDRV_DMA_TYPE_UNKNOWN || dmab->dev.type >= ARRAY_SIZE(snd_dma_ops))) return NULL; return snd_dma_ops[dmab->dev.type]; } |
| 145 145 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Do sleep inside a spin-lock * Copyright (c) 1999 by Takashi Iwai <tiwai@suse.de> */ #include <linux/export.h> #include <sound/core.h> #include "seq_lock.h" /* wait until all locks are released */ void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line) { int warn_count = 5 * HZ; if (atomic_read(lockp) < 0) { pr_warn("ALSA: seq_lock: lock trouble [counter = %d] in %s:%d\n", atomic_read(lockp), file, line); return; } while (atomic_read(lockp) > 0) { if (warn_count-- == 0) pr_warn("ALSA: seq_lock: waiting [%d left] in %s:%d\n", atomic_read(lockp), file, line); schedule_timeout_uninterruptible(1); } } EXPORT_SYMBOL(snd_use_lock_sync_helper); |
| 7 7 7 7 7 94 94 88 6 6 6 6 6 54 54 54 41 5 13 13 54 1 10 10 10 4 6 10 11 10 5 5 5 10 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "disk_groups.h" #include "sb-members.h" #include "super-io.h" #include <linux/sort.h> static int group_cmp(const void *_l, const void *_r) { const struct bch_disk_group *l = _l; const struct bch_disk_group *r = _r; return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: strncmp(l->label, r->label, sizeof(l->label)); } static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g, *sorted = NULL; unsigned nr_groups = disk_groups_nr(groups); unsigned i, len; int ret = 0; for (i = 0; i < sb->nr_devices; i++) { struct bch_member m = bch2_sb_member_get(sb, i); unsigned group_id; if (!BCH_MEMBER_GROUP(&m)) continue; group_id = BCH_MEMBER_GROUP(&m) - 1; if (group_id >= nr_groups) { prt_printf(err, "disk %u has invalid label %u (have %u)", i, group_id, nr_groups); return -BCH_ERR_invalid_sb_disk_groups; } if (BCH_GROUP_DELETED(&groups->entries[group_id])) { prt_printf(err, "disk %u has deleted label %u", i, group_id); return -BCH_ERR_invalid_sb_disk_groups; } } if (!nr_groups) return 0; for (i = 0; i < nr_groups; i++) { g = groups->entries + i; if (BCH_GROUP_DELETED(g)) continue; len = strnlen(g->label, sizeof(g->label)); if (!len) { prt_printf(err, "label %u empty", i); return -BCH_ERR_invalid_sb_disk_groups; } } sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); if (!sorted) return -BCH_ERR_ENOMEM_disk_groups_validate; memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); for (g = sorted; g + 1 < sorted + nr_groups; g++) if (!BCH_GROUP_DELETED(g) && !group_cmp(&g[0], &g[1])) { prt_printf(err, "duplicate label %llu.%.*s", BCH_GROUP_PARENT(g), (int) sizeof(g->label), g->label); ret = -BCH_ERR_invalid_sb_disk_groups; goto err; } err: kfree(sorted); return ret; } void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) { out->atomic++; rcu_read_lock(); struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); if (!g) goto out; for (unsigned i = 0; i < g->nr; i++) { if (i) prt_printf(out, " "); if (g->entries[i].deleted) { prt_printf(out, "[deleted]"); continue; } prt_printf(out, "[parent %d devs", g->entries[i].parent); for_each_member_device_rcu(c, ca, &g->entries[i].devs) prt_printf(out, " %s", ca->name); prt_printf(out, "]"); } out: rcu_read_unlock(); out->atomic--; } static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g; unsigned nr_groups = disk_groups_nr(groups); for (g = groups->entries; g < groups->entries + nr_groups; g++) { if (g != groups->entries) prt_printf(out, " "); if (BCH_GROUP_DELETED(g)) prt_printf(out, "[deleted]"); else prt_printf(out, "[parent %llu name %s]", BCH_GROUP_PARENT(g), g->label); } } const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { .validate = bch2_sb_disk_groups_validate, .to_text = bch2_sb_disk_groups_to_text }; int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) { struct bch_sb_field_disk_groups *groups; struct bch_disk_groups_cpu *cpu_g, *old_g; unsigned i, g, nr_groups; lockdep_assert_held(&c->sb_lock); groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups); nr_groups = disk_groups_nr(groups); if (!groups) return 0; cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); if (!cpu_g) return -BCH_ERR_ENOMEM_disk_groups_to_cpu; cpu_g->nr = nr_groups; for (i = 0; i < nr_groups; i++) { struct bch_disk_group *src = &groups->entries[i]; struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; dst->deleted = BCH_GROUP_DELETED(src); dst->parent = BCH_GROUP_PARENT(src); memcpy(dst->label, src->label, sizeof(dst->label)); } for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); struct bch_disk_group_cpu *dst; if (!bch2_member_alive(&m)) continue; g = BCH_MEMBER_GROUP(&m); while (g) { dst = &cpu_g->entries[g - 1]; __set_bit(i, dst->devs.d); g = dst->parent; } } old_g = rcu_dereference_protected(c->disk_groups, lockdep_is_held(&c->sb_lock)); rcu_assign_pointer(c->disk_groups, cpu_g); if (old_g) kfree_rcu(old_g, rcu); return 0; } const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) { struct target t = target_decode(target); struct bch_devs_mask *devs; rcu_read_lock(); switch (t.type) { case TARGET_NULL: devs = NULL; break; case TARGET_DEV: { struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; devs = ca ? &ca->self : NULL; break; } case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); devs = g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; break; } default: BUG(); } rcu_read_unlock(); return devs; } bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) { struct target t = target_decode(target); switch (t.type) { case TARGET_NULL: return false; case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { struct bch_disk_groups_cpu *g; const struct bch_devs_mask *m; bool ret; rcu_read_lock(); g = rcu_dereference(c->disk_groups); m = g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; ret = m ? test_bit(dev, m->d) : false; rcu_read_unlock(); return ret; } default: BUG(); } } static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, unsigned parent, const char *name, unsigned namelen) { unsigned i, nr_groups = disk_groups_nr(groups); if (!namelen || namelen > BCH_SB_LABEL_SIZE) return -EINVAL; for (i = 0; i < nr_groups; i++) { struct bch_disk_group *g = groups->entries + i; if (BCH_GROUP_DELETED(g)) continue; if (!BCH_GROUP_DELETED(g) && BCH_GROUP_PARENT(g) == parent && strnlen(g->label, sizeof(g->label)) == namelen && !memcmp(name, g->label, namelen)) return i; } return -1; } static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, const char *name, unsigned namelen) { struct bch_sb_field_disk_groups *groups = bch2_sb_field_get(sb->sb, disk_groups); unsigned i, nr_groups = disk_groups_nr(groups); struct bch_disk_group *g; if (!namelen || namelen > BCH_SB_LABEL_SIZE) return -EINVAL; for (i = 0; i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); i++) ; if (i == nr_groups) { unsigned u64s = (sizeof(struct bch_sb_field_disk_groups) + sizeof(struct bch_disk_group) * (nr_groups + 1)) / sizeof(u64); groups = bch2_sb_field_resize(sb, disk_groups, u64s); if (!groups) return -BCH_ERR_ENOSPC_disk_label_add; nr_groups = disk_groups_nr(groups); } BUG_ON(i >= nr_groups); g = &groups->entries[i]; memcpy(g->label, name, namelen); if (namelen < sizeof(g->label)) g->label[namelen] = '\0'; SET_BCH_GROUP_DELETED(g, 0); SET_BCH_GROUP_PARENT(g, parent); SET_BCH_GROUP_DATA_ALLOWED(g, ~0); return i; } int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) { struct bch_sb_field_disk_groups *groups = bch2_sb_field_get(sb->sb, disk_groups); int v = -1; do { const char *next = strchrnul(name, '.'); unsigned len = next - name; if (*next == '.') next++; v = __bch2_disk_group_find(groups, v + 1, name, len); name = next; } while (*name && v >= 0); return v; } int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) { struct bch_sb_field_disk_groups *groups; unsigned parent = 0; int v = -1; do { const char *next = strchrnul(name, '.'); unsigned len = next - name; if (*next == '.') next++; groups = bch2_sb_field_get(sb->sb, disk_groups); v = __bch2_disk_group_find(groups, parent, name, len); if (v < 0) v = __bch2_disk_group_add(sb, parent, name, len); if (v < 0) return v; parent = v + 1; name = next; } while (*name && v >= 0); return v; } void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) { struct bch_disk_groups_cpu *groups; struct bch_disk_group_cpu *g; unsigned nr = 0; u16 path[32]; out->atomic++; rcu_read_lock(); groups = rcu_dereference(c->disk_groups); if (!groups) goto invalid; while (1) { if (nr == ARRAY_SIZE(path)) goto invalid; if (v >= groups->nr) goto invalid; g = groups->entries + v; if (g->deleted) goto invalid; path[nr++] = v; if (!g->parent) break; v = g->parent - 1; } while (nr) { v = path[--nr]; g = groups->entries + v; prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); if (nr) prt_printf(out, "."); } out: rcu_read_unlock(); out->atomic--; return; invalid: prt_printf(out, "invalid label %u", v); goto out; } void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct bch_sb_field_disk_groups *groups = bch2_sb_field_get(sb, disk_groups); struct bch_disk_group *g; unsigned nr = 0; u16 path[32]; while (1) { if (nr == ARRAY_SIZE(path)) goto inval; if (v >= disk_groups_nr(groups)) goto inval; g = groups->entries + v; if (BCH_GROUP_DELETED(g)) goto inval; path[nr++] = v; if (!BCH_GROUP_PARENT(g)) break; v = BCH_GROUP_PARENT(g) - 1; } while (nr) { v = path[--nr]; g = groups->entries + v; prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); if (nr) prt_printf(out, "."); } return; inval: prt_printf(out, "invalid label %u", v); } int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { struct bch_member *mi; int ret, v = -1; if (!strlen(name) || !strcmp(name, "none")) return 0; v = bch2_disk_path_find_or_create(&c->disk_sb, name); if (v < 0) return v; ret = bch2_sb_disk_groups_to_cpu(c); if (ret) return ret; mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_GROUP(mi, v + 1); return 0; } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { int ret; mutex_lock(&c->sb_lock); ret = __bch2_dev_group_set(c, ca, name) ?: bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; } int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) { struct bch_dev *ca; int g; if (!val) return -EINVAL; if (!c) return -BCH_ERR_option_needs_open_fs; if (!strlen(val) || !strcmp(val, "none")) { *res = 0; return 0; } /* Is it a device? */ ca = bch2_dev_lookup(c, val); if (!IS_ERR(ca)) { *res = dev_to_target(ca->dev_idx); bch2_dev_put(ca); return 0; } mutex_lock(&c->sb_lock); g = bch2_disk_path_find(&c->disk_sb, val); mutex_unlock(&c->sb_lock); if (g >= 0) { *res = group_to_target(g); return 0; } return -EINVAL; } void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) { struct target t = target_decode(v); switch (t.type) { case TARGET_NULL: prt_printf(out, "none"); break; case TARGET_DEV: { struct bch_dev *ca; out->atomic++; rcu_read_lock(); ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; if (ca && percpu_ref_tryget(&ca->io_ref)) { prt_printf(out, "/dev/%s", ca->name); percpu_ref_put(&ca->io_ref); } else if (ca) { prt_printf(out, "offline device %u", t.dev); } else { prt_printf(out, "invalid device %u", t.dev); } rcu_read_unlock(); out->atomic--; break; } case TARGET_GROUP: bch2_disk_path_to_text(out, c, t.group); break; default: BUG(); } } static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct target t = target_decode(v); switch (t.type) { case TARGET_NULL: prt_printf(out, "none"); break; case TARGET_DEV: { struct bch_member m = bch2_sb_member_get(sb, t.dev); if (bch2_member_exists(sb, t.dev)) { prt_printf(out, "Device "); pr_uuid(out, m.uuid.b); prt_printf(out, " (%u)", t.dev); } else { prt_printf(out, "Bad device %u", t.dev); } break; } case TARGET_GROUP: bch2_disk_path_to_text_sb(out, sb, t.group); break; default: BUG(); } } void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, struct bch_sb *sb, u64 v) { if (c) bch2_target_to_text(out, c, v); else bch2_target_to_text_sb(out, sb, v); } |
| 14503 14488 14518 13515 13511 13528 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic Timer-queue * * Manages a simple queue of timers, ordered by expiration time. * Uses rbtrees for quick list adds and expiration. * * NOTE: All of the following functions need to be serialized * to avoid races. No locking is done by this library code. */ #include <linux/bug.h> #include <linux/timerqueue.h> #include <linux/rbtree.h> #include <linux/export.h> #define __node_2_tq(_n) \ rb_entry((_n), struct timerqueue_node, node) static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq(a)->expires < __node_2_tq(b)->expires; } /** * timerqueue_add - Adds timer to timerqueue. * * @head: head of timerqueue * @node: timer node to be added * * Adds the timer node to the timerqueue, sorted by the node's expires * value. Returns true if the newly added timer is the first expiring timer in * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); /** * timerqueue_del - Removes a timer from the timerqueue. * * @head: head of timerqueue * @node: timer node to be removed * * Removes the timer node from the timerqueue. Returns true if the queue is * not empty after the remove. */ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); /** * timerqueue_iterate_next - Returns the timer after the provided timer * * @node: Pointer to a timer. * * Provides the timer that is after the given node. This is used, when * necessary, to iterate through the list of timers in a timer list * without modifying the list. */ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) { struct rb_node *next; if (!node) return NULL; next = rb_next(&node->node); if (!next) return NULL; return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); |
| 3 375 376 3 3 3 47 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | // SPDX-License-Identifier: GPL-2.0 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { ASSERT_RTNL(); info->family = AF_INET; /* Paired with READ_ONCE() in fib4_seq_read() */ WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1); return call_fib_notifiers(net, event_type, info); } static unsigned int fib4_seq_read(const struct net *net) { /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib4_rules_dump(net, nb, extack); if (err) return err; return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { .family = AF_INET, .fib_seq_read = fib4_seq_read, .fib_dump = fib4_dump, .owner = THIS_MODULE, }; int __net_init fib4_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv4.fib_seq = 0; ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv4.notifier_ops = ops; return 0; } void __net_exit fib4_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv4.notifier_ops); } |
| 14 2 2 13 12 1 1 40 40 2 12 15 19 22 1 2 15 7 36 54 14 40 1 19 23 18 21 20 20 40 3 3 1 3 3 2 54 54 53 5 45 6 50 50 44 44 4 33 35 44 3 40 6 4 43 6 39 4 40 40 12 11 17 24 36 4 3 36 3 38 57 57 2 1 2 51 1 45 4 41 60 60 13 60 57 45 2 47 60 5 3 1 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2018-2019 HUAWEI, Inc. * https://www.huawei.com/ */ #include "internal.h" #include <linux/unaligned.h> #include <trace/events/erofs.h> struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; bool partialref; }; static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; unsigned int advise; di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); if (IS_ERR(di)) return PTR_ERR(di); m->lcn = lcn; m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); } else { m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } m->pblk = le32_to_cpu(di->di_u.blkaddr); } return 0; } static unsigned int decode_compactedbits(unsigned int lobits, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) return d1; ++d1; } while (++i < vcnt); /* vcnt - 1 (Z_EROFS_LCLUSTER_TYPE_NONHEAD) item */ if (!(lo & Z_EROFS_LI_D0_CBLKCNT)) d1 += lo - 1; return d1; } static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); const unsigned int lclusterbits = vi->z_logical_clusterbits; const unsigned int totalidx = erofs_iblks(inode); unsigned int compacted_4b_initial, compacted_2b, amortizedshift; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; erofs_off_t pos; u8 *in, type; int i; if (lcn >= totalidx || lclusterbits > 14) return -EINVAL; m->lcn = lcn; /* used to align to 32-byte (compacted_2b) alignment */ compacted_4b_initial = ((32 - ebase % 32) / 4) & 7; compacted_2b = 0; if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && compacted_4b_initial < totalidx) compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); pos = ebase; amortizedshift = 2; /* compact_4b */ if (lcn >= compacted_4b_initial) { pos += compacted_4b_initial * 4; lcn -= compacted_4b_initial; if (lcn < compacted_2b) { amortizedshift = 1; } else { pos += compacted_2b * 2; lcn -= compacted_2b; } } pos += lcn * (1 << amortizedshift); /* figure out the lcluster count in this pack */ if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); if (IS_ERR(in)) return PTR_ERR(in); /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); in -= bytes; i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_LI_D0_CBLKCNT) { if (!big_pcluster) { DBG_BUGON(1); return -EFSCORRUPTED; } m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; return 0; } else if (i + 1 != (int)vcnt) { m->delta[0] = lo; return 0; } /* * since the last lcluster in the pack is special, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ lo = decode_compactedbits(lobits, in, encodebits * (i - 1), &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) lo = 0; else if (lo & Z_EROFS_LI_D0_CBLKCNT) lo = 1; m->delta[0] = lo + 1; return 0; } m->clusterofs = lo; m->delta[0] = 0; /* figout out blkaddr (pblk) for HEAD lclusters */ if (!big_pcluster) { nblk = 1; while (i > 0) { --i; lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) i -= lo; if (i >= 0) ++nblk; } } else { nblk = 0; while (i > 0) { --i; lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { if (lo & Z_EROFS_LI_D0_CBLKCNT) { --i; nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT; continue; } /* bigpcluster shouldn't have plain d0 == 1 */ if (lo <= 1) { DBG_BUGON(1); return -EFSCORRUPTED; } i -= lo - 2; continue; } ++nblk; } } in += (vcnt << amortizedshift) - sizeof(__le32); m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; return 0; } static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, unsigned int lcn, bool lookahead) { switch (EROFS_I(m->inode)->datalayout) { case EROFS_INODE_COMPRESSED_FULL: return z_erofs_load_full_lcluster(m, lcn); case EROFS_INODE_COMPRESSED_COMPACT: return z_erofs_load_compact_lcluster(m, lcn, lookahead); default: return -EINVAL; } } static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned int lookback_distance) { struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; while (m->lcn >= lookback_distance) { unsigned long lcn = m->lcn - lookback_distance; int err; err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; switch (m->type) { case Z_EROFS_LCLUSTER_TYPE_NONHEAD: lookback_distance = m->delta[0]; if (!lookback_distance) goto err_bogus; continue; case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: case Z_EROFS_LCLUSTER_TYPE_HEAD2: m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; default: erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", m->type, lcn, vi->nid); DBG_BUGON(1); return -EOPNOTSUPP; } } err_bogus: erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, unsigned int initial_lcn) { struct inode *inode = m->inode; struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2; unsigned long lcn = m->lcn + 1; int err; DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); DBG_BUGON(m->type != m->headtype); if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || (lcn << vi->z_logical_clusterbits) >= inode->i_size) m->compressedblks = 1; if (m->compressedblks) goto out; err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; /* * If the 1st NONHEAD lcluster has already been handled initially w/o * valid compressedblks, which means at least it mustn't be CBLKCNT, or * an internal implemenatation error is detected. * * The following code can also handle it properly anyway, but let's * BUG_ON in the debugging mode only for developers to notice that. */ DBG_BUGON(lcn == initial_lcn && m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); switch (m->type) { case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: case Z_EROFS_LCLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 block-sized pcluster. */ m->compressedblks = 1; break; case Z_EROFS_LCLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; if (m->compressedblks) break; fallthrough; default: erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } out: m->map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) { struct inode *inode = m->inode; struct erofs_inode *vi = EROFS_I(inode); struct erofs_map_blocks *map = m->map; unsigned int lclusterbits = vi->z_logical_clusterbits; u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; int err; while (1) { /* handle the last EOF pcluster (no next HEAD lcluster) */ if ((lcn << lclusterbits) >= inode->i_size) { map->m_llen = inode->i_size - map->m_la; return 0; } err = z_erofs_load_lcluster_from_disk(m, lcn, true); if (err) return err; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { /* work around invalid d1 generated by pre-1.0 mkfs */ if (unlikely(!m->delta[1])) { m->delta[1] = 1; DBG_BUGON(1); } } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { if (lcn != headlcn) break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; } else { erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", m->type, lcn, vi->nid); DBG_BUGON(1); return -EOPNOTSUPP; } lcn += m->delta[1]; } map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; return 0; } static int z_erofs_do_map_blocks(struct inode *inode, struct erofs_map_blocks *map, int flags) { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, }; int err = 0; unsigned int lclusterbits, endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; lclusterbits = vi->z_logical_clusterbits; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) vi->z_idataoff = m.nextpackoff; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: case Z_EROFS_LCLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; /* * For ztailpacking files, in order to inline data more * effectively, special EOF lclusters are now supported * which can have three parts at most. */ if (ztailpacking && end > inode->i_size) end = inode->i_size; break; } /* m.lcn should be >= 1 if endoff < m.clusterofs */ if (!m.lcn) { erofs_err(inode->i_sb, "invalid logical cluster 0 at nid %llu", vi->nid); err = -EFSCORRUPTED; goto unmap_out; } end = (m.lcn << lclusterbits) | m.clusterofs; map->m_flags |= EROFS_MAP_FULL_MAPPED; m.delta[0] = 1; fallthrough; case Z_EROFS_LCLUSTER_TYPE_NONHEAD: /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; break; default: erofs_err(inode->i_sb, "unknown type %u @ offset %llu of nid %llu", m.type, ofs, vi->nid); err = -EOPNOTSUPP; goto unmap_out; } if (m.partialref) map->m_flags |= EROFS_MAP_PARTIAL_REF; map->m_llen = end - map->m_la; if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_FRAGMENT; } else { map->m_pa = erofs_pos(inode->i_sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; } if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) { if (map->m_llen > map->m_plen) { DBG_BUGON(1); err = -EFSCORRUPTED; goto unmap_out; } afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? Z_EROFS_COMPRESSION_INTERLACED : Z_EROFS_COMPRESSION_SHIFTED; } else { afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", afmt, vi->nid); err = -EFSCORRUPTED; goto unmap_out; } } map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE || map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) && map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; } unmap_out: erofs_unmap_metabuf(&m.map->buf); return err; } static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; int err, headnr; erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct z_erofs_map_header *h; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { /* * paired with smp_mb() at the end of the function to ensure * fields will only be observed after the bit is set. */ smp_mb(); return 0; } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); if (IS_ERR(h)) { err = PTR_ERR(h); goto out_unlock; } /* * if the highest bit of the 8-byte map header is set, the whole file * is stored in the packed inode. The rest bits keeps z_fragmentoff. */ if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); vi->z_tailextent_headlcn = 0; goto done; } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; headnr = 0; if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; goto out_put_metabuf; } vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", vi->nid); err = -EFSCORRUPTED; goto out_put_metabuf; } if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", vi->nid); err = -EFSCORRUPTED; goto out_put_metabuf; } if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; vi->z_idata_size = le16_to_cpu(h->h_idata_size); err = z_erofs_do_map_blocks(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (!map.m_plen || erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { erofs_err(sb, "invalid tail-packing pclustersize %llu", map.m_plen); err = -EFSCORRUPTED; } if (err < 0) goto out_put_metabuf; } if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); err = z_erofs_do_map_blocks(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) goto out_put_metabuf; } done: /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); out_put_metabuf: erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; } int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { struct erofs_inode *const vi = EROFS_I(inode); int err = 0; trace_erofs_map_blocks_enter(inode, map, flags); if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */ map->m_llen = map->m_la + 1 - inode->i_size; map->m_la = inode->i_size; map->m_flags = 0; } else { err = z_erofs_fill_inode_lazy(inode); if (!err) { if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && !vi->z_tailextent_headlcn) { map->m_la = 0; map->m_llen = inode->i_size; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; } else { err = z_erofs_do_map_blocks(inode, map, flags); } } if (!err && (map->m_flags & EROFS_MAP_ENCODED) && unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) err = -EOPNOTSUPP; if (err) map->m_llen = 0; } trace_erofs_map_blocks_exit(inode, map, flags, err); return err; } static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct erofs_map_blocks map = { .m_la = offset }; ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP); erofs_put_metabuf(&map.buf); if (ret < 0) return ret; iomap->bdev = inode->i_sb->s_bdev; iomap->offset = map.m_la; iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; /* * No strict rule on how to describe extents for post EOF, yet * we need to do like below. Otherwise, iomap itself will get * into an endless loop on post EOF. * * Calculate the effective offset by subtracting extent start * (map.m_la) from the requested offset, and add it to length. * (NB: offset >= map.m_la always) */ if (iomap->offset >= inode->i_size) iomap->length = length + offset - map.m_la; } iomap->flags = 0; return 0; } const struct iomap_ops z_erofs_iomap_report_ops = { .iomap_begin = z_erofs_iomap_begin_report, }; |
| 8 8 2 8 2 2 2 2 2 2 2 2 2 2 2 3 3 1 1 11 5 6 3 2 2 1 5 5 5 3 3 1 2 4 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-vbi-cap.c - vbi capture support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/videodev2.h> #include <media/v4l2-common.h> #include "vivid-core.h" #include "vivid-kthread-cap.h" #include "vivid-vbi-cap.h" #include "vivid-vbi-gen.h" #include "vivid-vid-common.h" static void vivid_sliced_vbi_cap_fill(struct vivid_dev *dev, unsigned seqnr) { struct vivid_vbi_gen_data *vbi_gen = &dev->vbi_gen; bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; vivid_vbi_gen_sliced(vbi_gen, is_60hz, seqnr); if (!is_60hz) { if (vivid_vid_can_loop(dev)) { if (dev->vbi_out_have_wss) { vbi_gen->data[12].data[0] = dev->vbi_out_wss[0]; vbi_gen->data[12].data[1] = dev->vbi_out_wss[1]; } else { vbi_gen->data[12].id = 0; } } else { switch (tpg_g_video_aspect(&dev->tpg)) { case TPG_VIDEO_ASPECT_14X9_CENTRE: vbi_gen->data[12].data[0] = 0x01; break; case TPG_VIDEO_ASPECT_16X9_CENTRE: vbi_gen->data[12].data[0] = 0x0b; break; case TPG_VIDEO_ASPECT_16X9_ANAMORPHIC: vbi_gen->data[12].data[0] = 0x07; break; case TPG_VIDEO_ASPECT_4X3: default: vbi_gen->data[12].data[0] = 0x08; break; } } } else if (vivid_vid_can_loop(dev) && is_60hz) { if (dev->vbi_out_have_cc[0]) { vbi_gen->data[0].data[0] = dev->vbi_out_cc[0][0]; vbi_gen->data[0].data[1] = dev->vbi_out_cc[0][1]; } else { vbi_gen->data[0].id = 0; } if (dev->vbi_out_have_cc[1]) { vbi_gen->data[1].data[0] = dev->vbi_out_cc[1][0]; vbi_gen->data[1].data[1] = dev->vbi_out_cc[1][1]; } else { vbi_gen->data[1].id = 0; } } } static void vivid_g_fmt_vbi_cap(struct vivid_dev *dev, struct v4l2_vbi_format *vbi) { bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; vbi->sampling_rate = 27000000; vbi->offset = 24; vbi->samples_per_line = 1440; vbi->sample_format = V4L2_PIX_FMT_GREY; vbi->start[0] = is_60hz ? V4L2_VBI_ITU_525_F1_START + 9 : V4L2_VBI_ITU_625_F1_START + 5; vbi->start[1] = is_60hz ? V4L2_VBI_ITU_525_F2_START + 9 : V4L2_VBI_ITU_625_F2_START + 5; vbi->count[0] = vbi->count[1] = is_60hz ? 12 : 18; vbi->flags = dev->vbi_cap_interlaced ? V4L2_VBI_INTERLACED : 0; vbi->reserved[0] = 0; vbi->reserved[1] = 0; } void vivid_raw_vbi_cap_process(struct vivid_dev *dev, struct vivid_buffer *buf) { struct v4l2_vbi_format vbi; u8 *vbuf = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); vivid_g_fmt_vbi_cap(dev, &vbi); buf->vb.sequence = dev->vbi_cap_seq_count; if (dev->field_cap == V4L2_FIELD_ALTERNATE) buf->vb.sequence /= 2; vivid_sliced_vbi_cap_fill(dev, buf->vb.sequence); memset(vbuf, 0x10, vb2_plane_size(&buf->vb.vb2_buf, 0)); if (!VIVID_INVALID_SIGNAL(dev->std_signal_mode[dev->input])) vivid_vbi_gen_raw(&dev->vbi_gen, &vbi, vbuf); } void vivid_sliced_vbi_cap_process(struct vivid_dev *dev, struct vivid_buffer *buf) { struct v4l2_sliced_vbi_data *vbuf = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); buf->vb.sequence = dev->vbi_cap_seq_count; if (dev->field_cap == V4L2_FIELD_ALTERNATE) buf->vb.sequence /= 2; vivid_sliced_vbi_cap_fill(dev, buf->vb.sequence); memset(vbuf, 0, vb2_plane_size(&buf->vb.vb2_buf, 0)); if (!VIVID_INVALID_SIGNAL(dev->std_signal_mode[dev->input])) { unsigned i; for (i = 0; i < 25; i++) vbuf[i] = dev->vbi_gen.data[i]; } } static int vbi_cap_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; unsigned size = vq->type == V4L2_BUF_TYPE_SLICED_VBI_CAPTURE ? 36 * sizeof(struct v4l2_sliced_vbi_data) : 1440 * 2 * (is_60hz ? 12 : 18); if (!vivid_is_sdtv_cap(dev)) return -EINVAL; if (*nplanes) return sizes[0] < size ? -EINVAL : 0; sizes[0] = size; *nplanes = 1; return 0; } static int vbi_cap_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; unsigned size = vb->vb2_queue->type == V4L2_BUF_TYPE_SLICED_VBI_CAPTURE ? 36 * sizeof(struct v4l2_sliced_vbi_data) : 1440 * 2 * (is_60hz ? 12 : 18); dprintk(dev, 1, "%s\n", __func__); if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } if (vb2_plane_size(vb, 0) < size) { dprintk(dev, 1, "%s data will not fit into plane (%lu < %u)\n", __func__, vb2_plane_size(vb, 0), size); return -EINVAL; } vb2_set_plane_payload(vb, 0, size); return 0; } static void vbi_cap_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vbi_cap_active); spin_unlock(&dev->slock); } static int vbi_cap_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); int err; dprintk(dev, 1, "%s\n", __func__); dev->vbi_cap_seq_count = 0; if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_cap(dev, &dev->vbi_cap_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vbi_cap_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vbi_cap_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_cap(dev, &dev->vbi_cap_streaming); } static void vbi_cap_buf_request_complete(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &dev->ctrl_hdl_vbi_cap); } const struct vb2_ops vivid_vbi_cap_qops = { .queue_setup = vbi_cap_queue_setup, .buf_prepare = vbi_cap_buf_prepare, .buf_queue = vbi_cap_buf_queue, .start_streaming = vbi_cap_start_streaming, .stop_streaming = vbi_cap_stop_streaming, .buf_request_complete = vbi_cap_buf_request_complete, }; int vidioc_g_fmt_vbi_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_vbi_format *vbi = &f->fmt.vbi; if (!vivid_is_sdtv_cap(dev) || !dev->has_raw_vbi_cap) return -EINVAL; vivid_g_fmt_vbi_cap(dev, vbi); return 0; } int vidioc_s_fmt_vbi_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); int ret = vidioc_g_fmt_vbi_cap(file, priv, f); if (ret) return ret; if (f->type != V4L2_BUF_TYPE_VBI_CAPTURE && vb2_is_busy(&dev->vb_vbi_cap_q)) return -EBUSY; return 0; } void vivid_fill_service_lines(struct v4l2_sliced_vbi_format *vbi, u32 service_set) { vbi->io_size = sizeof(struct v4l2_sliced_vbi_data) * 36; vbi->service_set = service_set; memset(vbi->service_lines, 0, sizeof(vbi->service_lines)); memset(vbi->reserved, 0, sizeof(vbi->reserved)); if (vbi->service_set == 0) return; if (vbi->service_set & V4L2_SLICED_CAPTION_525) { vbi->service_lines[0][21] = V4L2_SLICED_CAPTION_525; vbi->service_lines[1][21] = V4L2_SLICED_CAPTION_525; } if (vbi->service_set & V4L2_SLICED_WSS_625) { unsigned i; for (i = 7; i <= 18; i++) vbi->service_lines[0][i] = vbi->service_lines[1][i] = V4L2_SLICED_TELETEXT_B; vbi->service_lines[0][23] = V4L2_SLICED_WSS_625; } } int vidioc_g_fmt_sliced_vbi_cap(struct file *file, void *fh, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; if (!vivid_is_sdtv_cap(dev) || !dev->has_sliced_vbi_cap) return -EINVAL; vivid_fill_service_lines(vbi, dev->service_set_cap); return 0; } int vidioc_try_fmt_sliced_vbi_cap(struct file *file, void *fh, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; u32 service_set = vbi->service_set; if (!vivid_is_sdtv_cap(dev) || !dev->has_sliced_vbi_cap) return -EINVAL; service_set &= is_60hz ? V4L2_SLICED_CAPTION_525 : V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; vivid_fill_service_lines(vbi, service_set); return 0; } int vidioc_s_fmt_sliced_vbi_cap(struct file *file, void *fh, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; int ret = vidioc_try_fmt_sliced_vbi_cap(file, fh, fmt); if (ret) return ret; if (fmt->type != V4L2_BUF_TYPE_SLICED_VBI_CAPTURE && vb2_is_busy(&dev->vb_vbi_cap_q)) return -EBUSY; dev->service_set_cap = vbi->service_set; return 0; } int vidioc_g_sliced_vbi_cap(struct file *file, void *fh, struct v4l2_sliced_vbi_cap *cap) { struct vivid_dev *dev = video_drvdata(file); struct video_device *vdev = video_devdata(file); bool is_60hz; if (vdev->vfl_dir == VFL_DIR_RX) { is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; if (!vivid_is_sdtv_cap(dev) || !dev->has_sliced_vbi_cap || cap->type != V4L2_BUF_TYPE_SLICED_VBI_CAPTURE) return -EINVAL; } else { is_60hz = dev->std_out & V4L2_STD_525_60; if (!vivid_is_svid_out(dev) || !dev->has_sliced_vbi_out || cap->type != V4L2_BUF_TYPE_SLICED_VBI_OUTPUT) return -EINVAL; } cap->service_set = is_60hz ? V4L2_SLICED_CAPTION_525 : V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; if (is_60hz) { cap->service_lines[0][21] = V4L2_SLICED_CAPTION_525; cap->service_lines[1][21] = V4L2_SLICED_CAPTION_525; } else { unsigned i; for (i = 7; i <= 18; i++) cap->service_lines[0][i] = cap->service_lines[1][i] = V4L2_SLICED_TELETEXT_B; cap->service_lines[0][23] = V4L2_SLICED_WSS_625; } return 0; } |
| 3 2 251 3 413 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ #include "xfs_extent_busy.h" /* for struct xfs_busy_extents */ struct xfs_buf; struct xlog; struct xlog_ticket; struct xfs_mount; /* * get client id from packed copy. * * this hack is here because the xlog_pack code copies four bytes * of xlog_op_header containing the fields oh_clientid, oh_flags * and oh_res2 into the packed copy. * * later on this four byte chunk is treated as an int and the * client id is pulled out. * * this has endian issues, of course. */ static inline uint xlog_get_client_id(__be32 i) { return be32_to_cpu(i) >> 24; } /* * In core log state */ enum xlog_iclog_state { XLOG_STATE_ACTIVE, /* Current IC log being written to */ XLOG_STATE_WANT_SYNC, /* Want to sync this iclog; no more writes */ XLOG_STATE_SYNCING, /* This IC log is syncing */ XLOG_STATE_DONE_SYNC, /* Done syncing to disk */ XLOG_STATE_CALLBACK, /* Callback functions now */ XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */ }; #define XLOG_STATE_STRINGS \ { XLOG_STATE_ACTIVE, "XLOG_STATE_ACTIVE" }, \ { XLOG_STATE_WANT_SYNC, "XLOG_STATE_WANT_SYNC" }, \ { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \ { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \ { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" } /* * In core log flags */ #define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */ #define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */ #define XLOG_ICL_STRINGS \ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \ { XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" } /* * Log ticket flags */ #define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */ #define XLOG_TIC_FLAGS \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } /* * Below are states for covering allocation transactions. * By covering, we mean changing the h_tail_lsn in the last on-disk * log write such that no allocation transactions will be re-done during * recovery after a system crash. Recovery starts at the last on-disk * log write. * * These states are used to insert dummy log entries to cover * space allocation transactions which can undo non-transactional changes * after a crash. Writes to a file with space * already allocated do not result in any transactions. Allocations * might include space beyond the EOF. So if we just push the EOF a * little, the last transaction for the file could contain the wrong * size. If there is no file system activity, after an allocation * transaction, and the system crashes, the allocation transaction * will get replayed and the file will be truncated. This could * be hours/days/... after the allocation occurred. * * The fix for this is to do two dummy transactions when the * system is idle. We need two dummy transaction because the h_tail_lsn * in the log record header needs to point beyond the last possible * non-dummy transaction. The first dummy changes the h_tail_lsn to * the first transaction before the dummy. The second dummy causes * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. * * These dummy transactions get committed when everything * is idle (after there has been some activity). * * There are 5 states used to control this. * * IDLE -- no logging has been done on the file system or * we are done covering previous transactions. * NEED -- logging has occurred and we need a dummy transaction * when the log becomes idle. * DONE -- we were in the NEED state and have committed a dummy * transaction. * NEED2 -- we detected that a dummy transaction has gone to the * on disk log with no other transactions. * DONE2 -- we committed a dummy transaction when in the NEED2 state. * * There are two places where we switch states: * * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. * We commit the dummy transaction and switch to DONE or DONE2, * respectively. In all other states, we don't do anything. * * 2.) When we finish writing the on-disk log (xlog_state_clean_log). * * No matter what state we are in, if this isn't the dummy * transaction going out, the next state is NEED. * So, if we aren't in the DONE or DONE2 states, the next state * is NEED. We can't be finishing a write of the dummy record * unless it was committed and the state switched to DONE or DONE2. * * If we are in the DONE state and this was a write of the * dummy transaction, we move to NEED2. * * If we are in the DONE2 state and this was a write of the * dummy transaction, we move to IDLE. * * * Writing only one dummy transaction can get appended to * one file space allocation. When this happens, the log recovery * code replays the space allocation and a file could be truncated. * This is why we have the NEED2 and DONE2 states before going idle. */ #define XLOG_STATE_COVER_IDLE 0 #define XLOG_STATE_COVER_NEED 1 #define XLOG_STATE_COVER_DONE 2 #define XLOG_STATE_COVER_NEED2 3 #define XLOG_STATE_COVER_DONE2 4 #define XLOG_COVER_OPS 5 typedef struct xlog_ticket { struct list_head t_queue; /* reserve/write queue */ struct task_struct *t_task; /* task that owns this ticket */ xlog_tid_t t_tid; /* transaction identifier */ atomic_t t_ref; /* ticket reference count */ int t_curr_res; /* current reservation */ int t_unit_res; /* unit reservation */ char t_ocnt; /* original unit count */ char t_cnt; /* current unit count */ uint8_t t_flags; /* properties of reservation */ int t_iclog_hdrs; /* iclog hdrs in t_curr_res */ } xlog_ticket_t; /* * - A log record header is 512 bytes. There is plenty of room to grow the * xlog_rec_header_t into the reserved space. * - ic_data follows, so a write to disk can start at the beginning of * the iclog. * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_log is a pointer back to the global log structure. * - ic_size is the full size of the log buffer, minus the cycle headers. * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. * * Because of cacheline contention on large machines, we need to separate * various resources onto different cachelines. To start with, make the * structure cacheline aligned. The following fields can be contended on * by independent processes: * * - ic_callbacks * - ic_refcnt * - fields protected by the global l_icloglock * * so we need to ensure that these fields are located in separate cachelines. * We'll put all the read-only and l_icloglock fields in the first cacheline, * and move everything else out to subsequent cachelines. */ typedef struct xlog_in_core { wait_queue_head_t ic_force_wait; wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xlog *ic_log; u32 ic_size; u32 ic_offset; enum xlog_iclog_state ic_state; unsigned int ic_flags; void *ic_datap; /* pointer to iclog data */ struct list_head ic_callbacks; /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; xlog_in_core_2_t *ic_data; #define ic_header ic_data->hic_header #ifdef DEBUG bool ic_fail_crc : 1; #endif struct semaphore ic_sema; struct work_struct ic_end_io_work; struct bio ic_bio; struct bio_vec ic_bvec[]; } xlog_in_core_t; /* * The CIL context is used to aggregate per-transaction details as well be * passed to the iclog for checkpoint post-commit processing. After being * passed to the iclog, another context needs to be allocated for tracking the * next set of transactions to be aggregated into a checkpoint. */ struct xfs_cil; struct xfs_cil_ctx { struct xfs_cil *cil; xfs_csn_t sequence; /* chkpt sequence # */ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ atomic_t space_used; /* aggregate size of regions */ struct xfs_busy_extents busy_extents; struct list_head log_items; /* log items in chkpt */ struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct push_work; atomic_t order_id; /* * CPUs that could have added items to the percpu CIL data. Access is * coordinated with xc_ctx_lock. */ struct cpumask cil_pcpmask; }; /* * Per-cpu CIL tracking items */ struct xlog_cil_pcp { int32_t space_used; uint32_t space_reserved; struct list_head busy_extents; struct list_head log_items; }; /* * Committed Item List structure * * This structure is used to track log items that have been committed but not * yet written into the log. It is used only when the delayed logging mount * option is enabled. * * This structure tracks the list of committing checkpoint contexts so * we can avoid the problem of having to hold out new transactions during a * flush until we have a the commit record LSN of the checkpoint. We can * traverse the list of committing contexts in xlog_cil_push_lsn() to find a * sequence match and extract the commit LSN directly from there. If the * checkpoint is still in the process of committing, we can block waiting for * the commit LSN to be determined as well. This should make synchronous * operations almost as efficient as the old logging methods. */ struct xfs_cil { struct xlog *xc_log; unsigned long xc_flags; atomic_t xc_iclog_hdrs; struct workqueue_struct *xc_push_wq; struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; struct xfs_cil_ctx *xc_ctx; spinlock_t xc_push_lock ____cacheline_aligned_in_smp; xfs_csn_t xc_push_seq; bool xc_push_commit_stable; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; wait_queue_head_t xc_start_wait; xfs_csn_t xc_current_sequence; wait_queue_head_t xc_push_wait; /* background push throttle */ void __percpu *xc_pcp; /* percpu CIL structures */ } ____cacheline_aligned_in_smp; /* xc_flags bit values */ #define XLOG_CIL_EMPTY 1 #define XLOG_CIL_PCP_SPACE 2 /* * The amount of log space we allow the CIL to aggregate is difficult to size. * Whatever we choose, we have to make sure we can get a reservation for the * log space effectively, that it is large enough to capture sufficient * relogging to reduce log buffer IO significantly, but it is not too large for * the log or induces too much latency when writing out through the iclogs. We * track both space consumed and the number of vectors in the checkpoint * context, so we need to decide which to use for limiting. * * Every log buffer we write out during a push needs a header reserved, which * is at least one sector and more for v2 logs. Hence we need a reservation of * at least 512 bytes per 32k of log space just for the LR headers. That means * 16KB of reservation per megabyte of delayed logging space we will consume, * plus various headers. The number of headers will vary based on the num of * io vectors, so limiting on a specific number of vectors is going to result * in transactions of varying size. IOWs, it is more consistent to track and * limit space consumed in the log rather than by the number of objects being * logged in order to prevent checkpoint ticket overruns. * * Further, use of static reservations through the log grant mechanism is * problematic. It introduces a lot of complexity (e.g. reserve grant vs write * grant) and a significant deadlock potential because regranting write space * can block on log pushes. Hence if we have to regrant log space during a log * push, we can deadlock. * * However, we can avoid this by use of a dynamic "reservation stealing" * technique during transaction commit whereby unused reservation space in the * transaction ticket is transferred to the CIL ctx commit ticket to cover the * space needed by the checkpoint transaction. This means that we never need to * specifically reserve space for the CIL checkpoint transaction, nor do we * need to regrant space once the checkpoint completes. This also means the * checkpoint transaction ticket is specific to the checkpoint context, rather * than the CIL itself. * * With dynamic reservations, we can effectively make up arbitrary limits for * the checkpoint size so long as they don't violate any other size rules. * Recovery imposes a rule that no transaction exceed half the log, so we are * limited by that. Furthermore, the log transaction reservation subsystem * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * * In order to keep background CIL push efficient, we only need to ensure the * CIL is large enough to maintain sufficient in-memory relogging to avoid * repeated physical writes of frequently modified metadata. If we allow the CIL * to grow to a substantial fraction of the log, then we may be pinning hundreds * of megabytes of metadata in memory until the CIL flushes. This can cause * issues when we are running low on memory - pinned memory cannot be reclaimed, * and the CIL consumes a lot of memory. Hence we need to set an upper physical * size limit for the CIL that limits the maximum amount of memory pinned by the * CIL but does not limit performance by reducing relogging efficiency * significantly. * * As such, the CIL push threshold ends up being the smaller of two thresholds: * - a threshold large enough that it allows CIL to be pushed and progress to be * made without excessive blocking of incoming transaction commits. This is * defined to be 12.5% of the log space - half the 25% push threshold of the * AIL. * - small enough that it doesn't pin excessive amounts of memory but maintains * close to peak relogging efficiency. This is defined to be 16x the iclog * buffer window (32MB) as measurements have shown this to be roughly the * point of diminishing performance increases under highly concurrent * modification workloads. * * To prevent the CIL from overflowing upper commit size bounds, we introduce a * new threshold at which we block committing transactions until the background * CIL commit commences and switches to a new context. While this is not a hard * limit, it forces the process committing a transaction to the CIL to block and * yeild the CPU, giving the CIL push work a chance to be scheduled and start * work. This prevents a process running lots of transactions from overfilling * the CIL because it is not yielding the CPU. We set the blocking limit at * twice the background push space threshold so we keep in line with the AIL * push thresholds. * * Note: this is not a -hard- limit as blocking is applied after the transaction * is inserted into the CIL and the push has been triggered. It is largely a * throttling mechanism that allows the CIL push to be scheduled and run. A hard * limit will be difficult to implement without introducing global serialisation * in the CIL commit fast path, and it's not at all clear that we actually need * such hard limits given the ~7 years we've run without a hard limit before * finding the first situation where a checkpoint size overflow actually * occurred. Hence the simple throttle, and an ASSERT check to tell us that * we've overrun the max size. */ #define XLOG_CIL_SPACE_LIMIT(log) \ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) #define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \ (XLOG_CIL_SPACE_LIMIT(log) * 2) /* * ticket grant locks, queues and accounting have their own cachlines * as these are quite hot and can be operated on concurrently. */ struct xlog_grant_head { spinlock_t lock ____cacheline_aligned_in_smp; struct list_head waiters; atomic64_t grant; }; /* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buftarg *l_targ; /* buftarg of log */ struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */ struct delayed_work l_work; /* background flush work */ long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_bufs; /* number of iclog buffers */ xfs_daddr_t l_logBBstart; /* start block of log */ int l_logsize; /* size of log in bytes */ int l_logBBsize; /* size of log in BB chunks */ /* The following block of fields are changed while holding icloglock */ wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ xlog_in_core_t *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ /* * l_tail_lsn is atomic so it can be set and read without needing to * hold specific locks. To avoid operations contending with other hot * objects, it on a separate cacheline. */ /* lsn of 1st LR with unflushed * buffers */ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; struct xlog_grant_head l_reserve_head; struct xlog_grant_head l_write_head; uint64_t l_tail_space; struct xfs_kobj l_kobj; /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; uint32_t l_iclog_roundoff;/* padding roundoff */ }; /* * Bits for operational state */ #define XLOG_ACTIVE_RECOVERY 0 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 1 /* log was recovered */ #define XLOG_IO_ERROR 2 /* log hit an I/O error, and being shutdown */ #define XLOG_TAIL_WARN 3 /* log tail verify warning issued */ #define XLOG_SHUTDOWN_STARTED 4 /* xlog_force_shutdown() exclusion */ static inline bool xlog_recovery_needed(struct xlog *log) { return test_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); } static inline bool xlog_in_recovery(struct xlog *log) { return test_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); } static inline bool xlog_is_shutdown(struct xlog *log) { return test_bit(XLOG_IO_ERROR, &log->l_opstate); } /* * Wait until the xlog_force_shutdown() has marked the log as shut down * so xlog_is_shutdown() will always return true. */ static inline void xlog_shutdown_wait( struct xlog *log) { wait_var_event(&log->l_opstate, xlog_is_shutdown(log)); } /* common routines */ extern int xlog_recover( struct xlog *log); extern int xlog_recover_finish( struct xlog *log); extern void xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, int count, bool permanent); void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, struct list_head *lv_chain, struct xlog_ticket *tic, uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not * change while we are cracking it into the component values. This means we * will always get consistent component values to work from. This should always * be used to sample and crack LSNs that are stored and updated in atomic * variables. */ static inline void xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) { xfs_lsn_t val = atomic64_read(lsn); *cycle = CYCLE_LSN(val); *block = BLOCK_LSN(val); } /* * Calculate and assign a value to an atomic LSN variable from component pieces. */ static inline void xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) { atomic64_set(lsn, xlog_assign_lsn(cycle, block)); } /* * Committed Item List interfaces */ int xlog_cil_init(struct xlog *log); void xlog_cil_init_post_recovery(struct xlog *log); void xlog_cil_destroy(struct xlog *log); bool xlog_cil_empty(struct xlog *log); void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, xfs_csn_t *commit_seq, bool regrant); void xlog_cil_set_ctx_write_state(struct xfs_cil_ctx *ctx, struct xlog_in_core *iclog); /* * CIL force routines */ void xlog_cil_flush(struct xlog *log); xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence); static inline void xlog_cil_force(struct xlog *log) { xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence); } /* * Wrapper function for waiting on a wait queue serialised against wakeups * by a spinlock. This matches the semantics of all the wait queues used in the * log code. */ static inline void xlog_wait( struct wait_queue_head *wq, struct spinlock *lock) __releases(lock) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(wq, &wait); __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(lock); schedule(); remove_wait_queue(wq, &wait); } int xlog_wait_on_iclog(struct xlog_in_core *iclog) __releases(iclog->ic_log->l_icloglock); /* Calculate the distance between two LSNs in bytes */ static inline uint64_t xlog_lsn_sub( struct xlog *log, xfs_lsn_t high, xfs_lsn_t low) { uint32_t hi_cycle = CYCLE_LSN(high); uint32_t hi_block = BLOCK_LSN(high); uint32_t lo_cycle = CYCLE_LSN(low); uint32_t lo_block = BLOCK_LSN(low); if (hi_cycle == lo_cycle) return BBTOB(hi_block - lo_block); ASSERT((hi_cycle == lo_cycle + 1) || xlog_is_shutdown(log)); return (uint64_t)log->l_logsize - BBTOB(lo_block - hi_block); } void xlog_grant_return_space(struct xlog *log, xfs_lsn_t old_head, xfs_lsn_t new_head); /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this * means that the next log record that includes this metadata could have a * smaller LSN. In turn, this means that the modification in the log would not * replay. */ static inline bool xlog_valid_lsn( struct xlog *log, xfs_lsn_t lsn) { int cur_cycle; int cur_block; bool valid = true; /* * First, sample the current lsn without locking to avoid added * contention from metadata I/O. The current cycle and block are updated * (in xlog_state_switch_iclogs()) and read here in a particular order * to avoid false negatives (e.g., thinking the metadata LSN is valid * when it is not). * * The current block is always rewound before the cycle is bumped in * xlog_state_switch_iclogs() to ensure the current LSN is never seen in * a transiently forward state. Instead, we can see the LSN in a * transiently behind state if we happen to race with a cycle wrap. */ cur_cycle = READ_ONCE(log->l_curr_cycle); smp_rmb(); cur_block = READ_ONCE(log->l_curr_block); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { /* * If the metadata LSN appears invalid, it's possible the check * above raced with a wrap to the next log cycle. Grab the lock * to check for sure. */ spin_lock(&log->l_icloglock); cur_cycle = log->l_curr_cycle; cur_block = log->l_curr_block; spin_unlock(&log->l_icloglock); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) valid = false; } return valid; } /* * Log vector and shadow buffers can be large, so we need to use kvmalloc() here * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts * to fall back to vmalloc, so we can't actually do anything useful with gfp * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() * will do direct reclaim and compaction in the slow path, both of which are * horrendously expensive. We just want kmalloc to fail fast and fall back to * vmalloc if it can't get something straight away from the free lists or * buddy allocator. Hence we have to open code kvmalloc outselves here. * * This assumes that the caller uses memalloc_nofs_save task context here, so * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS * allocations. This is actually the only way to make vmalloc() do GFP_NOFS * allocations, so lets just all pretend this is a GFP_KERNEL context * operation.... */ static inline void * xlog_kvmalloc( size_t buf_size) { gfp_t flags = GFP_KERNEL; void *p; flags &= ~__GFP_DIRECT_RECLAIM; flags |= __GFP_NOWARN | __GFP_NORETRY; do { p = kmalloc(buf_size, flags); if (!p) p = vmalloc(buf_size); } while (!p); return p; } #endif /* __XFS_LOG_PRIV_H__ */ |
| 13 3 11 2 7 2 1 9 9 5 7 2 1 1 15 1 1 1 1 10 1 9 6 6 6 5 5 5 1 6 3 3 4 1 1 1 1 27 27 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 | // SPDX-License-Identifier: GPL-2.0-only /* bpf/cpumap.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ /** * DOC: cpu map * The 'cpumap' is primarily used as a backend map for XDP BPF helper * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. * * Unlike devmap which redirects XDP frames out to another NIC device, * this map type redirects raw XDP frames to another CPU. The remote * CPU will do SKB-allocation and call the normal network stack. */ /* * This is a scalability and isolation mechanism, that allow * separating the early driver network XDP layer, from the rest of the * netstack, and assigning dedicated CPUs for this stage. This * basically allows for 10G wirespeed pre-filtering via bpf. */ #include <linux/bitops.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ptr_ring.h> #include <net/xdp.h> #include <net/hotdata.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/completion.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #include <linux/netdevice.h> /* netif_receive_skb_list */ #include <linux/etherdevice.h> /* eth_type_trans */ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ struct bpf_cpu_map_entry; struct bpf_cpu_map; struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; struct list_head flush_node; struct bpf_cpu_map_entry *obj; unsigned int count; }; /* Struct for every remote "destination" CPU in map */ struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; struct bpf_cpumap_val value; struct bpf_prog *prog; struct completion kthread_running; struct rcu_work free_work; }; struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry __rcu **cpu_map; }; static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || (value_size != offsetofend(struct bpf_cpumap_val, qsize) && value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (attr->max_entries > NR_CPUS) return ERR_PTR(-E2BIG); cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE); if (!cmap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&cmap->map, attr); /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) { bpf_map_area_free(cmap); return ERR_PTR(-ENOMEM); } return &cmap->map; } static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is * empty. See __cpu_map_entry_replace() and work-queue * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ void *ptr; while ((ptr = ptr_ring_consume(ring))) { WARN_ON_ONCE(1); if (unlikely(__ptr_test_bit(0, &ptr))) { __ptr_clear_bit(0, &ptr); kfree_skb(ptr); continue; } xdp_return_frame(ptr); } } static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, struct list_head *listp, struct xdp_cpumap_stats *stats) { struct sk_buff *skb, *tmp; struct xdp_buff xdp; u32 act; int err; list_for_each_entry_safe(skb, tmp, listp, list) { act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); switch (act) { case XDP_PASS: break; case XDP_REDIRECT: skb_list_del_init(skb); err = xdp_do_generic_redirect(skb->dev, skb, &xdp, rcpu->prog); if (unlikely(err)) { kfree_skb(skb); stats->drop++; } else { stats->redirect++; } return; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, rcpu->prog, act); fallthrough; case XDP_DROP: skb_list_del_init(skb); kfree_skb(skb); stats->drop++; return; } } } static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, void **frames, int n, struct xdp_cpumap_stats *stats) { struct xdp_rxq_info rxq = {}; struct xdp_buff xdp; int i, nframes = 0; xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; rxq.dev = xdpf->dev_rx; rxq.mem.type = xdpf->mem_type; /* TODO: report queue_index to xdp_rxq_info */ xdp_convert_frame_to_buff(xdpf, &xdp); act = bpf_prog_run_xdp(rcpu->prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (err < 0) { xdp_return_frame(xdpf); stats->drop++; } else { frames[nframes++] = xdpf; stats->pass++; } break; case XDP_REDIRECT: err = xdp_do_redirect(xdpf->dev_rx, &xdp, rcpu->prog); if (unlikely(err)) { xdp_return_frame(xdpf); stats->drop++; } else { stats->redirect++; } break; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_DROP: xdp_return_frame(xdpf); stats->drop++; break; } } xdp_clear_return_frame_no_direct(); return nframes; } #define CPUMAP_BATCH 8 static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, int xdp_n, struct xdp_cpumap_stats *stats, struct list_head *list) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int nframes; if (!rcpu->prog) return xdp_n; rcu_read_lock_bh(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); if (stats->redirect) xdp_do_flush(); if (unlikely(!list_empty(list))) cpu_map_bpf_prog_run_skb(rcpu, list, stats); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; } static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; unsigned long last_qs = jiffies; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); /* When kthread gives stop order, then rcpu have been disconnected * from map, thus no new packets can enter. Remaining in-flight * per CPU stored packets are flushed to this queue. Wait honoring * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; int i, n, m, nframes, xdp_n; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; LIST_HEAD(list); /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { set_current_state(TASK_INTERRUPTIBLE); /* Recheck to avoid lost wake-up */ if (__ptr_ring_empty(rcpu->queue)) { schedule(); sched = 1; last_qs = jiffies; } else { __set_current_state(TASK_RUNNING); } } else { rcu_softirq_qs_periodic(last_qs); sched = cond_resched(); } /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); for (i = 0, xdp_n = 0; i < n; i++) { void *f = frames[i]; struct page *page; if (unlikely(__ptr_test_bit(0, &f))) { struct sk_buff *skb = f; __ptr_clear_bit(0, &skb); list_add_tail(&skb->list, &list); continue; } frames[xdp_n++] = f; page = virt_to_page(f); /* Bring struct page memory area to curr CPU. Read by * build_skb_around via page_is_pfmemalloc(), and when * freed written by page_frag_free call. */ prefetchw(page); } /* Support running another XDP prog on this CPU */ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); if (nframes) { m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes, skbs); if (unlikely(m == 0)) { for (i = 0; i < nframes; i++) skbs[i] = NULL; /* effect: xdp_return_frame */ kmem_alloc_drops += nframes; } } local_bh_disable(); for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(xdpf, skb, xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; } list_add_tail(&skb->list, &list); } /* Feedback loop via tracepoint. * NB: keep before recv to allow measuring enqueue/dequeue latency. */ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); netif_receive_skb_list(&list); local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); return 0; } static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, struct bpf_map *map, int fd) { struct bpf_prog *prog; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->expected_attach_type != BPF_XDP_CPUMAP || !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return -EINVAL; } rcpu->value.bpf_prog.id = prog->aux->id; rcpu->prog = prog; return 0; } static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, u32 cpu) { int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) return NULL; /* Alloc percpu bulkq */ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), sizeof(void *), gfp); if (!rcpu->bulkq) goto free_rcu; for_each_possible_cpu(i) { bq = per_cpu_ptr(rcpu->bulkq, i); bq->obj = rcpu; } /* Alloc queue */ rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) goto free_bulkq; err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; /* Setup kthread */ init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); if (IS_ERR(rcpu->kthread)) goto free_prog; /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); /* Make sure kthread has been running, so kthread_stop() will not * stop the kthread prematurely and all pending frames or skbs * will be handled by the kthread before kthread_stop() returns. */ wait_for_completion(&rcpu->kthread_running); return rcpu; free_prog: if (rcpu->prog) bpf_prog_put(rcpu->prog); free_ptr_ring: ptr_ring_cleanup(rcpu->queue, NULL); free_queue: kfree(rcpu->queue); free_bulkq: free_percpu(rcpu->bulkq); free_rcu: kfree(rcpu); return NULL; } static void __cpu_map_entry_free(struct work_struct *work) { struct bpf_cpu_map_entry *rcpu; /* This cpu_map_entry have been disconnected from map and one * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work); /* kthread_stop will wake_up_process and wait for it to complete. * cpu_map_kthread_run() makes sure the pointer ring is empty * before exiting. */ kthread_stop(rcpu->kthread); if (rcpu->prog) bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); kfree(rcpu->queue); free_percpu(rcpu->bulkq); kfree(rcpu); } /* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old * entry is no longer in use before freeing. We use queue_rcu_work() to call * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace * period. This means that (a) all pending enqueue and flush operations have * completed (because of the RCU callback), and (b) we are in a workqueue * context where we can stop the kthread and wait for it to exit before freeing * everything. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); queue_rcu_work(system_wq, &old_rcpu->free_work); } } static long cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; if (key_cpu >= map->max_entries) return -EINVAL; /* notice caller map_delete_elem() uses rcu_read_lock() */ __cpu_map_entry_replace(cmap, key_cpu, NULL); return 0; } static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(key_cpu >= cmap->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); if (!rcpu) return -ENOMEM; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); rcu_read_unlock(); return 0; } static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the bpf programs (can be more than one that used this map) were * disconnected from events. Wait for outstanding critical sections in * these programs to complete. synchronize_rcu() below not only * guarantees no further "XDP/bpf-side" reads against * bpf_cpu_map->cpu_map, but also ensure pending flush operations * (if any) are completed. */ synchronize_rcu(); /* The only possible user of bpf_cpu_map_entry is * cpu_map_kthread_run(). */ for (i = 0; i < cmap->map.max_entries; i++) { struct bpf_cpu_map_entry *rcpu; rcpu = rcu_dereference_raw(cmap->cpu_map[i]); if (!rcpu) continue; /* Stop kthread and cleanup entry directly */ __cpu_map_entry_free(&rcpu->free_work.work); } bpf_map_area_free(cmap->cpu_map); bpf_map_area_free(cmap); } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; if (key >= map->max_entries) return NULL; rcpu = rcu_dereference_check(cmap->cpu_map[key], rcu_read_lock_bh_held()); return rcpu; } static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= cmap->map.max_entries) { *next = 0; return 0; } if (index == cmap->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); } static u64 cpu_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_cpu_map); /* Currently the dynamically allocated elements are not counted */ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *); return usage; } BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map) const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, .map_delete_elem = cpu_map_delete_elem, .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, .map_mem_usage = cpu_map_mem_usage, .map_btf_id = &cpu_map_btf_ids[0], .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; int i; if (unlikely(!bq->count)) return; q = rcpu->queue; spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; int err; err = __ptr_ring_produce(q, xdpf); if (err) { drops++; xdp_return_frame_rx_napi(xdpf); } processed++; } bq->count = 0; spin_unlock(&q->producer_lock); __list_del_clearprev(&bq->flush_node); /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); } /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; if (!bq->flush_node.prev) { struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list(); list_add(&bq->flush_node, flush_list); } } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { /* Info needed when constructing SKB on remote CPU */ xdpf->dev_rx = dev_rx; bq_enqueue(rcpu, xdpf); return 0; } int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb) { int ret; __skb_pull(skb, skb->mac_len); skb_set_redirected(skb, false); __ptr_set_bit(0, &skb); ret = ptr_ring_produce(rcpu->queue, skb); if (ret < 0) goto trace; wake_up_process(rcpu->kthread); trace: trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu); return ret; } void __cpu_map_flush(struct list_head *flush_list) { struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); } } |
| 5 8 119 135 137 135 135 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Key-agreement Protocol Primitives (KPP) * * Copyright (c) 2016, Intel Corporation * Authors: Salvatore Benedetto <salvatore.benedetto@intel.com> */ #include <crypto/internal/kpp.h> #include <linux/cryptouser.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" static int __maybe_unused crypto_kpp_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_kpp rkpp; memset(&rkpp, 0, sizeof(rkpp)); strscpy(rkpp.type, "kpp", sizeof(rkpp.type)); return nla_put(skb, CRYPTOCFGA_REPORT_KPP, sizeof(rkpp), &rkpp); } static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : kpp\n"); } static void crypto_kpp_exit_tfm(struct crypto_tfm *tfm) { struct crypto_kpp *kpp = __crypto_kpp_tfm(tfm); struct kpp_alg *alg = crypto_kpp_alg(kpp); alg->exit(kpp); } static int crypto_kpp_init_tfm(struct crypto_tfm *tfm) { struct crypto_kpp *kpp = __crypto_kpp_tfm(tfm); struct kpp_alg *alg = crypto_kpp_alg(kpp); if (alg->exit) kpp->base.exit = crypto_kpp_exit_tfm; if (alg->init) return alg->init(kpp); return 0; } static void crypto_kpp_free_instance(struct crypto_instance *inst) { struct kpp_instance *kpp = kpp_instance(inst); kpp->free(kpp); } static const struct crypto_type crypto_kpp_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_kpp_init_tfm, .free = crypto_kpp_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_kpp_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_kpp_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_KPP, .tfmsize = offsetof(struct crypto_kpp, base), }; struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_kpp_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_kpp); int crypto_grab_kpp(struct crypto_kpp_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_kpp_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_kpp); int crypto_has_kpp(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_kpp); static void kpp_prepare_alg(struct kpp_alg *alg) { struct crypto_alg *base = &alg->base; base->cra_type = &crypto_kpp_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_KPP; } int crypto_register_kpp(struct kpp_alg *alg) { struct crypto_alg *base = &alg->base; kpp_prepare_alg(alg); return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_kpp); void crypto_unregister_kpp(struct kpp_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_kpp); int kpp_register_instance(struct crypto_template *tmpl, struct kpp_instance *inst) { if (WARN_ON(!inst->free)) return -EINVAL; kpp_prepare_alg(&inst->alg); return crypto_register_instance(tmpl, kpp_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(kpp_register_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Key-agreement Protocol Primitives"); |
| 1 1 1 1 1 1 6 6 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 | // SPDX-License-Identifier: GPL-2.0 /* * usb-serial driver for Quatech SSU-100 * * based on ftdi_sio.c and the original serqt_usb.c from Quatech * */ #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/serial_reg.h> #include <linux/uaccess.h> #define QT_OPEN_CLOSE_CHANNEL 0xca #define QT_SET_GET_DEVICE 0xc2 #define QT_SET_GET_REGISTER 0xc0 #define QT_GET_SET_PREBUF_TRIG_LVL 0xcc #define QT_SET_ATF 0xcd #define QT_GET_SET_UART 0xc1 #define QT_TRANSFER_IN 0xc0 #define QT_HW_FLOW_CONTROL_MASK 0xc5 #define QT_SW_FLOW_CONTROL_MASK 0xc6 #define SERIAL_MSR_MASK 0xf0 #define SERIAL_CRTSCTS ((UART_MCR_RTS << 8) | UART_MSR_CTS) #define SERIAL_EVEN_PARITY (UART_LCR_PARITY | UART_LCR_EPAR) #define MAX_BAUD_RATE 460800 #define ATC_DISABLED 0x00 #define DUPMODE_BITS 0xc0 #define RR_BITS 0x03 #define LOOPMODE_BITS 0x41 #define RS232_MODE 0x00 #define RTSCTS_TO_CONNECTOR 0x40 #define CLKS_X4 0x02 #define FULLPWRBIT 0x00000080 #define NEXT_BOARD_POWER_BIT 0x00000004 #define DRIVER_DESC "Quatech SSU-100 USB to Serial Driver" #define USB_VENDOR_ID_QUATECH 0x061d /* Quatech VID */ #define QUATECH_SSU100 0xC020 /* SSU100 */ static const struct usb_device_id id_table[] = { {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_SSU100)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); struct ssu100_port_private { spinlock_t status_lock; u8 shadowLSR; u8 shadowMSR; }; static inline int ssu100_control_msg(struct usb_device *dev, u8 request, u16 data, u16 index) { return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), request, 0x40, data, index, NULL, 0, 300); } static inline int ssu100_setdevice(struct usb_device *dev, u8 *data) { u16 x = ((u16)(data[1] << 8) | (u16)(data[0])); return ssu100_control_msg(dev, QT_SET_GET_DEVICE, x, 0); } static inline int ssu100_getdevice(struct usb_device *dev, u8 *data) { int ret; ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), QT_SET_GET_DEVICE, 0xc0, 0, 0, data, 3, 300); if (ret < 3) { if (ret >= 0) ret = -EIO; } return ret; } static inline int ssu100_getregister(struct usb_device *dev, unsigned short uart, unsigned short reg, u8 *data) { int ret; ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), QT_SET_GET_REGISTER, 0xc0, reg, uart, data, sizeof(*data), 300); if (ret < (int)sizeof(*data)) { if (ret >= 0) ret = -EIO; } return ret; } static inline int ssu100_setregister(struct usb_device *dev, unsigned short uart, unsigned short reg, u16 data) { u16 value = (data << 8) | reg; return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), QT_SET_GET_REGISTER, 0x40, value, uart, NULL, 0, 300); } #define set_mctrl(dev, set) update_mctrl((dev), (set), 0) #define clear_mctrl(dev, clear) update_mctrl((dev), 0, (clear)) /* these do not deal with device that have more than 1 port */ static inline int update_mctrl(struct usb_device *dev, unsigned int set, unsigned int clear) { unsigned urb_value; int result; if (((set | clear) & (TIOCM_DTR | TIOCM_RTS)) == 0) { dev_dbg(&dev->dev, "%s - DTR|RTS not being set|cleared\n", __func__); return 0; /* no change */ } clear &= ~set; /* 'set' takes precedence over 'clear' */ urb_value = 0; if (set & TIOCM_DTR) urb_value |= UART_MCR_DTR; if (set & TIOCM_RTS) urb_value |= UART_MCR_RTS; result = ssu100_setregister(dev, 0, UART_MCR, urb_value); if (result < 0) dev_dbg(&dev->dev, "%s Error from MODEM_CTRL urb\n", __func__); return result; } static int ssu100_initdevice(struct usb_device *dev) { u8 *data; int result = 0; data = kzalloc(3, GFP_KERNEL); if (!data) return -ENOMEM; result = ssu100_getdevice(dev, data); if (result < 0) { dev_dbg(&dev->dev, "%s - get_device failed %i\n", __func__, result); goto out; } data[1] &= ~FULLPWRBIT; result = ssu100_setdevice(dev, data); if (result < 0) { dev_dbg(&dev->dev, "%s - setdevice failed %i\n", __func__, result); goto out; } result = ssu100_control_msg(dev, QT_GET_SET_PREBUF_TRIG_LVL, 128, 0); if (result < 0) { dev_dbg(&dev->dev, "%s - set prebuffer level failed %i\n", __func__, result); goto out; } result = ssu100_control_msg(dev, QT_SET_ATF, ATC_DISABLED, 0); if (result < 0) { dev_dbg(&dev->dev, "%s - set ATFprebuffer level failed %i\n", __func__, result); goto out; } result = ssu100_getdevice(dev, data); if (result < 0) { dev_dbg(&dev->dev, "%s - get_device failed %i\n", __func__, result); goto out; } data[0] &= ~(RR_BITS | DUPMODE_BITS); data[0] |= CLKS_X4; data[1] &= ~(LOOPMODE_BITS); data[1] |= RS232_MODE; result = ssu100_setdevice(dev, data); if (result < 0) { dev_dbg(&dev->dev, "%s - setdevice failed %i\n", __func__, result); goto out; } out: kfree(data); return result; } static void ssu100_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct usb_device *dev = port->serial->dev; struct ktermios *termios = &tty->termios; u16 baud, divisor, remainder; unsigned int cflag = termios->c_cflag; u16 urb_value = 0; /* will hold the new flags */ int result; if (cflag & PARENB) { if (cflag & PARODD) urb_value |= UART_LCR_PARITY; else urb_value |= SERIAL_EVEN_PARITY; } urb_value |= UART_LCR_WLEN(tty_get_char_size(cflag)); baud = tty_get_baud_rate(tty); if (!baud) baud = 9600; dev_dbg(&port->dev, "%s - got baud = %d\n", __func__, baud); divisor = MAX_BAUD_RATE / baud; remainder = MAX_BAUD_RATE % baud; if (((remainder * 2) >= baud) && (baud != 110)) divisor++; urb_value = urb_value << 8; result = ssu100_control_msg(dev, QT_GET_SET_UART, divisor, urb_value); if (result < 0) dev_dbg(&port->dev, "%s - set uart failed\n", __func__); if (cflag & CRTSCTS) result = ssu100_control_msg(dev, QT_HW_FLOW_CONTROL_MASK, SERIAL_CRTSCTS, 0); else result = ssu100_control_msg(dev, QT_HW_FLOW_CONTROL_MASK, 0, 0); if (result < 0) dev_dbg(&port->dev, "%s - set HW flow control failed\n", __func__); if (I_IXOFF(tty) || I_IXON(tty)) { u16 x = ((u16)(START_CHAR(tty) << 8) | (u16)(STOP_CHAR(tty))); result = ssu100_control_msg(dev, QT_SW_FLOW_CONTROL_MASK, x, 0); } else result = ssu100_control_msg(dev, QT_SW_FLOW_CONTROL_MASK, 0, 0); if (result < 0) dev_dbg(&port->dev, "%s - set SW flow control failed\n", __func__); } static int ssu100_open(struct tty_struct *tty, struct usb_serial_port *port) { struct usb_device *dev = port->serial->dev; struct ssu100_port_private *priv = usb_get_serial_port_data(port); u8 *data; int result; unsigned long flags; data = kzalloc(2, GFP_KERNEL); if (!data) return -ENOMEM; result = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), QT_OPEN_CLOSE_CHANNEL, QT_TRANSFER_IN, 0x01, 0, data, 2, 300); if (result < 2) { dev_dbg(&port->dev, "%s - open failed %i\n", __func__, result); if (result >= 0) result = -EIO; kfree(data); return result; } spin_lock_irqsave(&priv->status_lock, flags); priv->shadowLSR = data[0]; priv->shadowMSR = data[1]; spin_unlock_irqrestore(&priv->status_lock, flags); kfree(data); /* set to 9600 */ result = ssu100_control_msg(dev, QT_GET_SET_UART, 0x30, 0x0300); if (result < 0) dev_dbg(&port->dev, "%s - set uart failed\n", __func__); if (tty) ssu100_set_termios(tty, port, &tty->termios); return usb_serial_generic_open(tty, port); } static int ssu100_attach(struct usb_serial *serial) { return ssu100_initdevice(serial->dev); } static int ssu100_port_probe(struct usb_serial_port *port) { struct ssu100_port_private *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; spin_lock_init(&priv->status_lock); usb_set_serial_port_data(port, priv); return 0; } static void ssu100_port_remove(struct usb_serial_port *port) { struct ssu100_port_private *priv; priv = usb_get_serial_port_data(port); kfree(priv); } static int ssu100_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_device *dev = port->serial->dev; u8 *d; int r; d = kzalloc(2, GFP_KERNEL); if (!d) return -ENOMEM; r = ssu100_getregister(dev, 0, UART_MCR, d); if (r < 0) goto mget_out; r = ssu100_getregister(dev, 0, UART_MSR, d+1); if (r < 0) goto mget_out; r = (d[0] & UART_MCR_DTR ? TIOCM_DTR : 0) | (d[0] & UART_MCR_RTS ? TIOCM_RTS : 0) | (d[1] & UART_MSR_CTS ? TIOCM_CTS : 0) | (d[1] & UART_MSR_DCD ? TIOCM_CAR : 0) | (d[1] & UART_MSR_RI ? TIOCM_RI : 0) | (d[1] & UART_MSR_DSR ? TIOCM_DSR : 0); mget_out: kfree(d); return r; } static int ssu100_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct usb_device *dev = port->serial->dev; return update_mctrl(dev, set, clear); } static void ssu100_dtr_rts(struct usb_serial_port *port, int on) { struct usb_device *dev = port->serial->dev; /* Disable flow control */ if (!on) { if (ssu100_setregister(dev, 0, UART_MCR, 0) < 0) dev_err(&port->dev, "error from flowcontrol urb\n"); } /* drop RTS and DTR */ if (on) set_mctrl(dev, TIOCM_DTR | TIOCM_RTS); else clear_mctrl(dev, TIOCM_DTR | TIOCM_RTS); } static void ssu100_update_msr(struct usb_serial_port *port, u8 msr) { struct ssu100_port_private *priv = usb_get_serial_port_data(port); unsigned long flags; spin_lock_irqsave(&priv->status_lock, flags); priv->shadowMSR = msr; spin_unlock_irqrestore(&priv->status_lock, flags); if (msr & UART_MSR_ANY_DELTA) { /* update input line counters */ if (msr & UART_MSR_DCTS) port->icount.cts++; if (msr & UART_MSR_DDSR) port->icount.dsr++; if (msr & UART_MSR_DDCD) port->icount.dcd++; if (msr & UART_MSR_TERI) port->icount.rng++; wake_up_interruptible(&port->port.delta_msr_wait); } } static void ssu100_update_lsr(struct usb_serial_port *port, u8 lsr, char *tty_flag) { struct ssu100_port_private *priv = usb_get_serial_port_data(port); unsigned long flags; spin_lock_irqsave(&priv->status_lock, flags); priv->shadowLSR = lsr; spin_unlock_irqrestore(&priv->status_lock, flags); *tty_flag = TTY_NORMAL; if (lsr & UART_LSR_BRK_ERROR_BITS) { /* we always want to update icount, but we only want to * update tty_flag for one case */ if (lsr & UART_LSR_BI) { port->icount.brk++; *tty_flag = TTY_BREAK; usb_serial_handle_break(port); } if (lsr & UART_LSR_PE) { port->icount.parity++; if (*tty_flag == TTY_NORMAL) *tty_flag = TTY_PARITY; } if (lsr & UART_LSR_FE) { port->icount.frame++; if (*tty_flag == TTY_NORMAL) *tty_flag = TTY_FRAME; } if (lsr & UART_LSR_OE) { port->icount.overrun++; tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } } } static void ssu100_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; char *packet = urb->transfer_buffer; char flag = TTY_NORMAL; u32 len = urb->actual_length; int i; char *ch; if ((len >= 4) && (packet[0] == 0x1b) && (packet[1] == 0x1b) && ((packet[2] == 0x00) || (packet[2] == 0x01))) { if (packet[2] == 0x00) ssu100_update_lsr(port, packet[3], &flag); if (packet[2] == 0x01) ssu100_update_msr(port, packet[3]); len -= 4; ch = packet + 4; } else ch = packet; if (!len) return; /* status only */ if (port->sysrq) { for (i = 0; i < len; i++, ch++) { if (!usb_serial_handle_sysrq_char(port, *ch)) tty_insert_flip_char(&port->port, *ch, flag); } } else { tty_insert_flip_string_fixed_flag(&port->port, ch, flag, len); } tty_flip_buffer_push(&port->port); } static struct usb_serial_driver ssu100_device = { .driver = { .name = "ssu100", }, .description = DRIVER_DESC, .id_table = id_table, .num_ports = 1, .open = ssu100_open, .attach = ssu100_attach, .port_probe = ssu100_port_probe, .port_remove = ssu100_port_remove, .dtr_rts = ssu100_dtr_rts, .process_read_urb = ssu100_process_read_urb, .tiocmget = ssu100_tiocmget, .tiocmset = ssu100_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .get_icount = usb_serial_generic_get_icount, .set_termios = ssu100_set_termios, }; static struct usb_serial_driver * const serial_drivers[] = { &ssu100_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2"); |
| 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | // SPDX-License-Identifier: GPL-2.0-only /* ipv6header match - matches IPv6 packets based on whether they contain certain headers */ /* Original idea: Brad Chapman * Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6t_ipv6header.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 header types match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; unsigned int temp; int len; u8 nexthdr; unsigned int ptr; /* Make sure this isn't an evil packet */ /* type of the 1st exthdr */ nexthdr = ipv6_hdr(skb)->nexthdr; /* pointer to the 1st exthdr */ ptr = sizeof(struct ipv6hdr); /* available length */ len = skb->len - ptr; temp = 0; while (nf_ip6_ext_hdr(nexthdr)) { const struct ipv6_opt_hdr *hp; struct ipv6_opt_hdr _hdr; int hdrlen; /* No more exthdr -> evaluate */ if (nexthdr == NEXTHDR_NONE) { temp |= MASK_NONE; break; } /* Is there enough space for the next ext header? */ if (len < (int)sizeof(struct ipv6_opt_hdr)) return false; /* ESP -> evaluate */ if (nexthdr == NEXTHDR_ESP) { temp |= MASK_ESP; break; } hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); if (!hp) { par->hotdrop = true; return false; } /* Calculate the header length */ if (nexthdr == NEXTHDR_FRAGMENT) hdrlen = 8; else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); /* set the flag */ switch (nexthdr) { case NEXTHDR_HOP: temp |= MASK_HOPOPTS; break; case NEXTHDR_ROUTING: temp |= MASK_ROUTING; break; case NEXTHDR_FRAGMENT: temp |= MASK_FRAGMENT; break; case NEXTHDR_AUTH: temp |= MASK_AH; break; case NEXTHDR_DEST: temp |= MASK_DSTOPTS; break; default: return false; } nexthdr = hp->nexthdr; len -= hdrlen; ptr += hdrlen; if (ptr > skb->len) break; } if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP) temp |= MASK_PROTO; if (info->modeflag) return !((temp ^ info->matchflags ^ info->invflags) & info->matchflags); else { if (info->invflags) return temp != info->matchflags; else return temp == info->matchflags; } } static int ipv6header_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; /* invflags is 0 or 0xff in hard mode */ if ((!info->modeflag) && info->invflags != 0x00 && info->invflags != 0xFF) return -EINVAL; return 0; } static struct xt_match ipv6header_mt6_reg __read_mostly = { .name = "ipv6header", .family = NFPROTO_IPV6, .match = ipv6header_mt6, .matchsize = sizeof(struct ip6t_ipv6header_info), .checkentry = ipv6header_mt6_check, .destroy = NULL, .me = THIS_MODULE, }; static int __init ipv6header_mt6_init(void) { return xt_register_match(&ipv6header_mt6_reg); } static void __exit ipv6header_mt6_exit(void) { xt_unregister_match(&ipv6header_mt6_reg); } module_init(ipv6header_mt6_init); module_exit(ipv6header_mt6_exit); |
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | // SPDX-License-Identifier: GPL-2.0 /* * Utility functions for file contents encryption/decryption on * block device-based filesystems. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility */ #include <linux/pagemap.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/namei.h> #include "fscrypt_private.h" /** * fscrypt_decrypt_bio() - decrypt the contents of a bio * @bio: the bio to decrypt * * Decrypt the contents of a "read" bio following successful completion of the * underlying disk read. The bio must be reading a whole number of blocks of an * encrypted file directly into the page cache. If the bio is reading the * ciphertext into bounce pages instead of the page cache (for example, because * the file is also compressed, so decompression is required after decryption), * then this function isn't applicable. This function may sleep, so it must be * called from a workqueue rather than from the bio's bi_end_io callback. * * Return: %true on success; %false on failure. On failure, bio->bi_status is * also set to an error status. */ bool fscrypt_decrypt_bio(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { int err = fscrypt_decrypt_pagecache_blocks(fi.folio, fi.length, fi.offset); if (err) { bio->bi_status = errno_to_blk_status(err); return false; } } return true; } EXPORT_SYMBOL(fscrypt_decrypt_bio); static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { const unsigned int blockbits = inode->i_blkbits; const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits); struct bio *bio; int ret, err = 0; int num_pages = 0; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS); while (len) { unsigned int blocks_this_page = min(len, blocks_per_page); unsigned int bytes_this_page = blocks_this_page << blockbits; if (num_pages == 0) { fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); bio->bi_iter.bi_sector = pblk << (blockbits - SECTOR_SHIFT); } ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); if (WARN_ON_ONCE(ret != bytes_this_page)) { err = -EIO; goto out; } num_pages++; len -= blocks_this_page; lblk += blocks_this_page; pblk += blocks_this_page; if (num_pages == BIO_MAX_VECS || !len || !fscrypt_mergeable_bio(bio, inode, lblk)) { err = submit_bio_wait(bio); if (err) goto out; bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); num_pages = 0; } } out: bio_put(bio); return err; } /** * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file * @inode: the file's inode * @lblk: the first file logical block to zero out * @pblk: the first filesystem physical block to zero out * @len: number of blocks to zero out * * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write * ciphertext blocks which decrypt to the all-zeroes block. The blocks must be * both logically and physically contiguous. It's also assumed that the * filesystem only uses a single block device, ->s_bdev. * * Note that since each block uses a different IV, this involves writing a * different ciphertext to each block; we can't simply reuse the same one. * * Return: 0 on success; -errno on failure. */ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { const struct fscrypt_inode_info *ci = inode->i_crypt_info; const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; const unsigned int du_per_page = 1U << du_per_page_bits; u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits); u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits); sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; unsigned int i; unsigned int offset; struct bio *bio; int ret, err; if (len == 0) return 0; if (fscrypt_inode_uses_inline_crypto(inode)) return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk, len); BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); nr_pages = min_t(u64, ARRAY_SIZE(pages), (du_remaining + du_per_page - 1) >> du_per_page_bits); /* * We need at least one page for ciphertext. Allocate the first one * from a mempool, with __GFP_DIRECT_RECLAIM set so that it can't fail. * * Any additional page allocations are allowed to fail, as they only * help performance, and waiting on the mempool for them could deadlock. */ for (i = 0; i < nr_pages; i++) { pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS : GFP_NOWAIT | __GFP_NOWARN); if (!pages[i]) break; } nr_pages = i; if (WARN_ON_ONCE(nr_pages <= 0)) return -EINVAL; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS); do { bio->bi_iter.bi_sector = sector; i = 0; offset = 0; do { err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index, ZERO_PAGE(0), pages[i], du_size, offset, GFP_NOFS); if (err) goto out; du_index++; sector += 1U << (du_bits - SECTOR_SHIFT); du_remaining--; offset += du_size; if (offset == PAGE_SIZE || du_remaining == 0) { ret = bio_add_page(bio, pages[i++], offset, 0); if (WARN_ON_ONCE(ret != offset)) { err = -EIO; goto out; } offset = 0; } } while (i != nr_pages && du_remaining != 0); err = submit_bio_wait(bio); if (err) goto out; bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); } while (du_remaining != 0); err = 0; out: bio_put(bio); for (i = 0; i < nr_pages; i++) fscrypt_free_bounce_page(pages[i]); return err; } EXPORT_SYMBOL(fscrypt_zeroout_range); |
| 32 27 25 15 15 1 27 1 26 6 21 1 32 27 6 6 326 326 9 20 6 51 51 51 51 51 23 16 26 33 26 3 19 26 6 2 7 1 18 30 15 38 7 34 5 26 5 5 14 38 50 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include "ctree.h" #include "fs.h" #include "messages.h" #include "inode-item.h" #include "disk-io.h" #include "transaction.h" #include "space-info.h" #include "accessors.h" #include "extent-tree.h" #include "file-item.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name) { struct btrfs_inode_ref *ref; unsigned long ptr; unsigned long name_ptr; u32 item_size; u32 cur_offset = 0; int len; item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { ref = (struct btrfs_inode_ref *)(ptr + cur_offset); len = btrfs_inode_ref_name_len(leaf, ref); name_ptr = (unsigned long)(ref + 1); cur_offset += len + sizeof(*ref); if (len != name->len) continue; if (memcmp_extent_buffer(leaf, name->name, name_ptr, name->len) == 0) return ref; } return NULL; } struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; unsigned long ptr; unsigned long name_ptr; u32 item_size; u32 cur_offset = 0; int ref_name_len; item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); /* * Search all extended backrefs in this item. We're only * looking through any collisions so most of the time this is * just going to compare against one buffer. If all is well, * we'll return success and the inode ref object. */ while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *) (ptr + cur_offset); name_ptr = (unsigned long)(&extref->name); ref_name_len = btrfs_inode_extref_name_len(leaf, extref); if (ref_name_len == name->len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && (memcmp_extent_buffer(leaf, name->name, name_ptr, name->len) == 0)) return extref; cur_offset += ref_name_len + sizeof(*extref); } return NULL; } /* Returns NULL if no extref found */ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { int ret; struct btrfs_key key; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); if (ret > 0) return NULL; return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; struct btrfs_key key; struct btrfs_inode_extref *extref; struct extent_buffer *leaf; int ret; int del_len = name->len + sizeof(*extref); unsigned long ptr; unsigned long item_start; u32 item_size; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) ret = -ENOENT; if (ret < 0) goto out; /* * Sanity check - did we find the right item for this name? * This should always succeed so error here will make the FS * readonly. */ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); if (!extref) { btrfs_abort_transaction(trans, -ENOENT); ret = -ENOENT; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (index) *index = btrfs_inode_extref_index(leaf, extref); if (del_len == item_size) { /* * Common case only one ref in the item, remove the * whole item. */ ret = btrfs_del_item(trans, root, path); goto out; } ptr = (unsigned long)extref; item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); btrfs_truncate_item(trans, path, item_size - del_len, 1); out: btrfs_free_path(path); return ret; } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; struct btrfs_key key; struct btrfs_inode_ref *ref; struct extent_buffer *leaf; unsigned long ptr; unsigned long item_start; u32 item_size; u32 sub_item_len; int ret; int search_ext_refs = 0; int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; search_ext_refs = 1; goto out; } else if (ret < 0) { goto out; } ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); if (!ref) { ret = -ENOENT; search_ext_refs = 1; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (index) *index = btrfs_inode_ref_index(leaf, ref); if (del_len == item_size) { ret = btrfs_del_item(trans, root, path); goto out; } ptr = (unsigned long)ref; sub_item_len = name->len + sizeof(*ref); item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); btrfs_truncate_item(trans, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); if (search_ext_refs) { /* * No refs were found, or we could not find the * name in our ref array. Find and remove the extended * inode ref then. */ return btrfs_del_inode_extref(trans, root, name, inode_objectid, ref_objectid, index); } return ret; } /* * Insert an extended inode ref into a tree. * * The caller must have checked against BTRFS_LINK_MAX already. */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_inode_extref *extref; int ret; int ins_len = name->len + sizeof(*extref); unsigned long ptr; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name)) goto out; btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) goto out; leaf = path->nodes[0]; ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len); btrfs_set_inode_extref_index(path->nodes[0], extref, index); btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); out: btrfs_free_path(path); return ret; } /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_inode_ref *ref; unsigned long ptr; int ret; int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { u32 old_size; ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); if (ref) goto out; old_size = btrfs_item_size(path->nodes[0], path->slots[0]); btrfs_extend_item(trans, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); ret = 0; } else if (ret < 0) { if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name)) ret = -EEXIST; else ret = -EMLINK; } goto out; } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); out: btrfs_free_path(path); if (ret == -EMLINK) { struct btrfs_super_block *disk_super = fs_info->super_copy; /* We ran out of space in the ref array. Need to * add an extended ref. */ if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) ret = btrfs_insert_inode_extref(trans, root, name, inode_objectid, ref_objectid, index); } return ret; } int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid) { struct btrfs_key key; int ret; key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_inode_item)); return ret; } int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod) { int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; int ret; int slot; struct extent_buffer *leaf; struct btrfs_key found_key; ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY && location->offset == (u64)-1 && path->slots[0] != 0) { slot = path->slots[0] - 1; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid == location->objectid && found_key.type == location->type) { path->slots[0]--; return 0; } } return ret; } static inline void btrfs_trace_truncate(const struct btrfs_inode *inode, const struct extent_buffer *leaf, const struct btrfs_file_extent_item *fi, u64 offset, int extent_type, int slot) { if (!inode) return; if (extent_type == BTRFS_FILE_EXTENT_INLINE) trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot, offset); else trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset); } /* * Remove inode items from a given root. * * @trans: A transaction handle. * @root: The root from which to remove items. * @inode: The inode whose items we want to remove. * @control: The btrfs_truncate_control to control how and what we * are truncating. * * Remove all keys associated with the inode from the given root that have a key * with a type greater than or equals to @min_type. When @min_type has a value of * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value * greater than or equals to @new_size. If a file extent item that starts before * @new_size and ends after it is found, its length is adjusted. * * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; struct btrfs_key found_key; u64 new_size = control->new_size; u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; u32 found_type = (u8)-1; int del_item; int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; int ret; u64 bytes_deleted = 0; bool be_nice = false; ASSERT(control->inode || !control->clear_extent_range); ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY); control->last_size = new_size; control->sub_bytes = 0; /* * For shareable roots we want to back off from time to time, this turns * out to be subvolume roots, reloc roots, and data reloc roots. */ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) be_nice = true; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_BACK; key.objectid = control->ino; key.offset = (u64)-1; key.type = (u8)-1; search_again: /* * With a 16K leaf size and 128MiB extents, you can actually queue up a * huge file in a single leaf. Most of the time that bytes_deleted is * > 0, it will be huge by the time we get here */ if (be_nice && bytes_deleted > SZ_32M && btrfs_should_end_transaction(trans)) { ret = -EAGAIN; goto out; } ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = 0; /* There are no items in the tree for us to truncate, we're done */ if (path->slots[0] == 0) goto out; path->slots[0]--; } while (1) { u64 clear_start = 0, clear_len = 0, extent_start = 0; bool refill_delayed_refs_rsv = false; fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = found_key.type; if (found_key.objectid != control->ino) break; if (found_type < control->min_type) break; item_end = found_key.offset; if (found_type == BTRFS_EXTENT_DATA_KEY) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); if (extent_type != BTRFS_FILE_EXTENT_INLINE) item_end += btrfs_file_extent_num_bytes(leaf, fi); else if (extent_type == BTRFS_FILE_EXTENT_INLINE) item_end += btrfs_file_extent_ram_bytes(leaf, fi); btrfs_trace_truncate(control->inode, leaf, fi, found_key.offset, extent_type, path->slots[0]); item_end--; } if (found_type > control->min_type) { del_item = 1; } else { if (item_end < new_size) break; if (found_key.offset >= new_size) del_item = 1; else del_item = 0; } /* FIXME, shrink the extent if the ref count is only 1 */ if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; control->extents_found++; if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; clear_start = found_key.offset; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); extent_num_bytes = ALIGN(new_size - found_key.offset, fs_info->sectorsize); clear_start = ALIGN(new_size, fs_info->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); extent_offset = found_key.offset - btrfs_file_extent_offset(leaf, fi); /* FIXME blocksize != 4096 */ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) control->sub_bytes += num_dec; } clear_len = num_dec; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * We can't truncate inline items that have had * special encodings */ if (!del_item && btrfs_file_extent_encryption(leaf, fi) == 0 && btrfs_file_extent_other_encoding(leaf, fi) == 0 && btrfs_file_extent_compression(leaf, fi) == 0) { u32 size = (u32)(new_size - found_key.offset); btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(trans, path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to * just before this extent. */ ret = BTRFS_NEED_TRUNCATE_BLOCK; break; } else { /* * Inline extents are special, we just treat * them as a full sector worth in the file * extent tree just for simplicity sake. */ clear_len = fs_info->sectorsize; } control->sub_bytes += item_end + 1 - new_size; } delete: /* * We only want to clear the file extent range if we're * modifying the actual inode's mapping, which is just the * normal truncate path. */ if (control->clear_extent_range) { ret = btrfs_inode_clear_file_extent_range(control->inode, clear_start, clear_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } } if (del_item) { ASSERT(!pending_del_nr || ((path->slots[0] + 1) == pending_del_slot)); control->last_size = found_key.offset; if (!pending_del_nr) { /* No pending yet, add ourselves */ pending_del_slot = path->slots[0]; pending_del_nr = 1; } else if (path->slots[0] + 1 == pending_del_slot) { /* Hop on the pending chunk */ pending_del_nr++; pending_del_slot = path->slots[0]; } } else { control->last_size = new_size; break; } if (del_item && extent_start != 0 && !control->skip_ref_updates) { struct btrfs_ref ref = { .action = BTRFS_DROP_DELAYED_REF, .bytenr = extent_start, .num_bytes = extent_num_bytes, .owning_root = btrfs_root_id(root), .ref_root = btrfs_header_owner(leaf), }; bytes_deleted += extent_num_bytes; btrfs_init_data_ref(&ref, control->ino, extent_offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } if (be_nice && btrfs_check_space_for_delayed_refs(fs_info)) refill_delayed_refs_rsv = true; } if (found_type == BTRFS_INODE_ITEM_KEY) break; if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || refill_delayed_refs_rsv) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (ret) { btrfs_abort_transaction(trans, ret); break; } pending_del_nr = 0; } btrfs_release_path(path); /* * We can generate a lot of delayed refs, so we need to * throttle every once and a while and make sure we're * adding enough space to keep up with the work we are * generating. Since we hold a transaction here we * can't flush, and we don't want to FLUSH_LIMIT because * we could have generated too many delayed refs to * actually allocate, so just bail if we're short and * let the normal reservation dance happen higher up. */ if (refill_delayed_refs_rsv) { ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_NO_FLUSH); if (ret) { ret = -EAGAIN; break; } } goto search_again; } else { path->slots[0]--; } } out: if (ret >= 0 && pending_del_nr) { int err; err = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (err) { btrfs_abort_transaction(trans, err); ret = err; } } ASSERT(control->last_size >= new_size); if (!ret && control->last_size > new_size) control->last_size = new_size; btrfs_free_path(path); return ret; } |
| 41494 1363 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" #define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) /* * There are 32 bits/features in each mask word. The high bits * (selected with (bit>>5) give us the word number and the low 5 * bits give us the bit/feature number inside the word. * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can * see if it is set in the mask word. */ #define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) /* * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all * header macros which use NCAPINTS need to be changed. The duplicated macro * use causes the compiler to issue errors for all headers so that all usage * sites can be corrected. */ #define REQUIRED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This is the default CPU features testing macro to use in code. * * It is for detection of features which need kernel infrastructure to be * used. It may *not* directly test the CPU itself. Use the cpu_has() family * if you want true runtime testing of CPU features, like in hypervisor code * where you are supporting a possible guest feature where host support for it * is not relevant. */ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ #endif /* _ASM_X86_CPUFEATURE_H */ |
| 15 31 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_RCUPDATE_WAIT_H #define _LINUX_SCHED_RCUPDATE_WAIT_H /* * RCU synchronization types and methods: */ #include <linux/rcupdate.h> #include <linux/completion.h> #include <linux/sched.h> /* * Structure allowing asynchronous waiting on RCU. */ struct rcu_synchronize { struct rcu_head head; struct completion completion; }; void wakeme_after_rcu(struct rcu_head *head); void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array); #define _wait_rcu_gp(checktiny, state, ...) \ do { \ call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ __wait_rcu_gp(checktiny, state, ARRAY_SIZE(__crcu_array), __crcu_array, __rs_array); \ } while (0) #define wait_rcu_gp(...) _wait_rcu_gp(false, TASK_UNINTERRUPTIBLE, __VA_ARGS__) #define wait_rcu_gp_state(state, ...) _wait_rcu_gp(false, state, __VA_ARGS__) /** * synchronize_rcu_mult - Wait concurrently for multiple grace periods * @...: List of call_rcu() functions for different grace periods to wait on * * This macro waits concurrently for multiple types of RCU grace periods. * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait * on concurrent RCU and RCU-tasks grace periods. Waiting on a given SRCU * domain requires you to write a wrapper function for that SRCU domain's * call_srcu() function, with this wrapper supplying the pointer to the * corresponding srcu_struct. * * Note that call_rcu_hurry() should be used instead of call_rcu() * because in kernels built with CONFIG_RCU_LAZY=y the delay between the * invocation of call_rcu() and that of the corresponding RCU callback * can be multiple seconds. * * The first argument tells Tiny RCU's _wait_rcu_gp() not to * bother waiting for RCU. The reason for this is because anywhere * synchronize_rcu_mult() can be called is automatically already a full * grace period. */ #define synchronize_rcu_mult(...) \ _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), TASK_UNINTERRUPTIBLE, __VA_ARGS__) static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) rcu_read_unlock(); cond_resched(); rcu_read_lock(); #endif } // Has the current task blocked within its current RCU read-side // critical section? static inline bool has_rcu_reader_blocked(void) { #ifdef CONFIG_PREEMPT_RCU return !list_empty(¤t->rcu_node_entry); #else return false; #endif } #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ |
| 56 31 73 78 24 4 2 16 2 151 1 144 144 17 1 13 3 26 4 1 2 4 1 1 1 1 4 11 3 11 7 4 4 3 1 4 4 4 4 4 4 4 151 26 15 26 22 11 11 11 11 150 151 11 11 11 11 11 11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 | // SPDX-License-Identifier: GPL-2.0-or-later /************************************************************ * EFI GUID Partition Table handling * * http://www.uefi.org/specs/ * http://www.intel.com/technology/efi/ * * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> * Copyright 2000,2001,2002,2004 Dell Inc. * * TODO: * * Changelog: * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com> * - detect hybrid MBRs, tighter pMBR checking & cleanups. * * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. * - check for first/last_usable_lba outside of size of disk * * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com> * - Ported to 2.5.7-pre1 and 2.5.7-dj2 * - Applied patch to avoid fault in alternate header handling * - cleaned up find_valid_gpt * - On-disk structure and copy in memory is *always* LE now - * swab fields as needed * - remove print_gpt_header() * - only use first max_p partition entries, to keep the kernel minor number * and partition numbers tied. * * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com> * - Removed __PRIPTR_PREFIX - not being used * * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com> * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied * * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com> * - Added compare_gpts(). * - moved le_efi_guid_to_cpus() back into this file. GPT is the only * thing that keeps EFI GUIDs on disk. * - Changed gpt structure names and members to be simpler and more Linux-like. * * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com> * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck * * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com> * - Changed function comments to DocBook style per Andreas Dilger suggestion. * * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com> * - Change read_lba() to use the page cache per Al Viro's work. * - print u64s properly on all architectures * - fixed debug_printk(), now Dprintk() * * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com> * - Style cleanups * - made most functions static * - Endianness addition * - remove test for second alternate header, as it's not per spec, * and is unnecessary. There's now a method to read/write the last * sector of an odd-sized disk from user space. No tools have ever * been released which used this code, so it's effectively dead. * - Per Asit Mallick of Intel, added a test for a valid PMBR. * - Added kernel command line option 'gpt' to override valid PMBR test. * * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com> * - added devfs volume UUID support (/dev/volumes/uuids) for * mounting file systems by the partition GUID. * * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com> * - Moved crc32() to linux/lib, added efi_crc32(). * * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com> * - Replaced Intel's CRC32 function with an equivalent * non-license-restricted version. * * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com> * - Fixed the last_lba() call to return the proper last block * * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com> * - Thanks to Andries Brouwer for his debugging assistance. * - Code works, detects all the partitions. * ************************************************************/ #include <linux/kernel.h> #include <linux/crc32.h> #include <linux/ctype.h> #include <linux/math64.h> #include <linux/slab.h> #include "check.h" #include "efi.h" /* This allows a kernel command line option 'gpt' to override * the test for invalid PMBR. Not __initdata because reloading * the partition tables happens after init too. */ static int force_gpt; static int __init force_gpt_fn(char *str) { force_gpt = 1; return 1; } __setup("gpt", force_gpt_fn); /** * efi_crc32() - EFI version of crc32 function * @buf: buffer to calculate crc32 of * @len: length of buf * * Description: Returns EFI-style CRC32 value for @buf * * This function uses the little endian Ethernet polynomial * but seeds the function with ~0, and xor's with ~0 at the end. * Note, the EFI Specification, v1.02, has a reference to * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). */ static inline u32 efi_crc32(const void *buf, unsigned long len) { return (crc32(~0L, buf, len) ^ ~0L); } /** * last_lba(): return number of last logical block of device * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ static u64 last_lba(struct gendisk *disk) { return div_u64(bdev_nr_bytes(disk->part0), queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) { if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) goto invalid; /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; return GPT_MBR_PROTECTIVE; invalid: return 0; } /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure * @total_sectors: amount of sectors in the device * * Description: Checks for a valid protective or hybrid * master boot record (MBR). The validity of a pMBR depends * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found * * In addition, a hybrid MBR will have up to three additional * primary partitions, which point to the same space that's * marked out by up to three GPT partitions. * * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or * GPT_MBR_HYBRID depending on the device layout. */ static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { uint32_t sz = 0; int i, part = 0, ret = 0; /* invalid by default */ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) goto done; for (i = 0; i < 4; i++) { ret = pmbr_part_valid(&mbr->partition_record[i]); if (ret == GPT_MBR_PROTECTIVE) { part = i; /* * Ok, we at least know that there's a protective MBR, * now check if there are other partition types for * hybrid MBR. */ goto check_hybrid; } } if (ret != GPT_MBR_PROTECTIVE) goto done; check_hybrid: for (i = 0; i < 4; i++) if ((mbr->partition_record[i].os_type != EFI_PMBR_OSTYPE_EFI_GPT) && (mbr->partition_record[i].os_type != 0x00)) ret = GPT_MBR_HYBRID; /* * Protective MBRs take up the lesser of the whole disk * or 2 TiB (32bit LBA), ignoring the rest of the disk. * Some partitioning programs, nonetheless, choose to set * the size to the maximum 32-bit limitation, disregarding * the disk size. * * Hybrid MBRs do not necessarily comply with this. * * Consider a bad value here to be a warning to support dd'ing * an image from a smaller disk to a larger disk. */ if (ret == GPT_MBR_PROTECTIVE) { sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", sz, min_t(uint32_t, total_sectors - 1, 0xFFFFFFFF)); } done: return ret; } /** * read_lba(): Read bytes from disk, starting at given LBA * @state: disk parsed partitions * @lba: the Logical Block Address of the partition table * @buffer: destination buffer * @count: bytes to read * * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; sector_t n = lba * (queue_logical_block_size(state->disk->queue) / 512); if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { int copied = 512; Sector sect; unsigned char *data = read_part_sector(state, n++, §); if (!data) break; if (copied > count) copied = count; memcpy(buffer, data, copied); put_dev_sector(sect); buffer += copied; totalreadcount +=copied; count -= copied; } return totalreadcount; } /** * alloc_read_gpt_entries(): reads partition entries from disk * @state: disk parsed partitions * @gpt: GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. * Notes: remember to free pte when you're done! */ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, gpt_header *gpt) { size_t count; gpt_entry *pte; if (!gpt) return NULL; count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; pte = kmalloc(count, GFP_KERNEL); if (!pte) return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; } return pte; } /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk * @state: disk parsed partitions * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. */ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; } return gpt; } /** * is_gpt_valid() - tests one GPT header and PTEs for validity * @state: disk parsed partitions * @lba: logical block address of the GPT header to test * @gpt: GPT header ptr, filled on return. * @ptes: PTEs ptr, filled on return. * * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. */ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; u64 lastlba, pt_size; if (!ptes) return 0; if (!(*gpt = alloc_read_gpt_header(state, lba))) return 0; /* Check the GUID Partition Table signature */ if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { pr_debug("GUID Partition Table Header signature is wrong:" "%lld != %lld\n", (unsigned long long)le64_to_cpu((*gpt)->signature), (unsigned long long)GPT_HEADER_SIGNATURE); goto fail; } /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), queue_logical_block_size(state->disk->queue)); goto fail; } /* Check the GUID Partition Table header size is too small */ if (le32_to_cpu((*gpt)->header_size) < sizeof(gpt_header)) { pr_debug("GUID Partition Table Header size is too small: %u < %zu\n", le32_to_cpu((*gpt)->header_size), sizeof(gpt_header)); goto fail; } /* Check the GUID Partition Table CRC */ origcrc = le32_to_cpu((*gpt)->header_crc32); (*gpt)->header_crc32 = 0; crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); if (crc != origcrc) { pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", crc, origcrc); goto fail; } (*gpt)->header_crc32 = cpu_to_le32(origcrc); /* Check that the my_lba entry points to the LBA that contains * the GUID Partition Table */ if (le64_to_cpu((*gpt)->my_lba) != lba) { pr_debug("GPT my_lba incorrect: %lld != %lld\n", (unsigned long long)le64_to_cpu((*gpt)->my_lba), (unsigned long long)lba); goto fail; } /* Check the first_usable_lba and last_usable_lba are * within the disk. */ lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); goto fail; } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partition Entry Size check failed.\n"); goto fail; } /* Sanity check partition table size */ pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * le32_to_cpu((*gpt)->sizeof_partition_entry); if (pt_size > KMALLOC_MAX_SIZE) { pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", (unsigned long long)pt_size, KMALLOC_MAX_SIZE); goto fail; } if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); goto fail_ptes; } /* We're done, all's well */ return 1; fail_ptes: kfree(*ptes); *ptes = NULL; fail: kfree(*gpt); *gpt = NULL; return 0; } /** * is_pte_valid() - tests one PTE for validity * @pte:pte to check * @lastlba: last lba of the disk * * Description: returns 1 if valid, 0 on error. */ static inline int is_pte_valid(const gpt_entry *pte, const u64 lastlba) { if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || le64_to_cpu(pte->starting_lba) > lastlba || le64_to_cpu(pte->ending_lba) > lastlba) return 0; return 1; } /** * compare_gpts() - Search disk for valid GPT headers and PTEs * @pgpt: primary GPT header * @agpt: alternate GPT header * @lastlba: last LBA number * * Description: Returns nothing. Sanity checks pgpt and agpt fields * and prints warnings on discrepancies. * */ static void compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) { int error_found = 0; if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { pr_warn("GPT:first_usable_lbas don't match.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { pr_warn("GPT:last_usable_lbas don't match.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); error_found++; } if (le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); error_found++; } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs * @state: disk parsed partitions * @gpt: GPT header ptr, filled on return. * @ptes: PTEs ptr, filled on return. * * Description: Returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. * Validity depends on PMBR being valid (or being overridden by the * 'gpt' kernel command line option) and finding either the Primary * GPT header and PTEs valid, or the Alternate GPT header and PTEs * valid. If the Primary GPT header is not valid, the Alternate GPT header * is not checked unless the 'gpt' kernel command line option is passed. * This protects against devices which misreport their size, and forces * the user to decide to use the Alternate GPT. */ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_entry **ptes) { int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; struct gendisk *disk = state->disk; const struct block_device_operations *fops = disk->fops; sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); if (!legacymbr) goto fail; read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr, total_sectors); kfree(legacymbr); if (!good_pmbr) goto fail; pr_debug("Device has a %s MBR\n", good_pmbr == GPT_MBR_PROTECTIVE ? "protective" : "hybrid"); } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); if (good_pgpt) good_agpt = is_gpt_valid(state, le64_to_cpu(pgpt->alternate_lba), &agpt, &aptes); if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { sector_t agpt_sector; int err; err = fops->alternative_gpt_sector(disk, &agpt_sector); if (!err) good_agpt = is_gpt_valid(state, agpt_sector, &agpt, &aptes); } /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; compare_gpts(pgpt, agpt, lastlba); /* The good cases */ if (good_pgpt) { *gpt = pgpt; *ptes = pptes; kfree(agpt); kfree(aptes); if (!good_agpt) pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { *gpt = agpt; *ptes = aptes; kfree(pgpt); kfree(pptes); pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } fail: kfree(pgpt); kfree(agpt); kfree(pptes); kfree(aptes); *gpt = NULL; *ptes = NULL; return 0; } /** * utf16_le_to_7bit(): Naively converts a UTF-16LE string to 7-bit ASCII characters * @in: input UTF-16LE string * @size: size of the input string * @out: output string ptr, should be capable to store @size+1 characters * * Description: Converts @size UTF16-LE symbols from @in string to 7-bit * ASCII characters and stores them to @out. Adds trailing zero to @out array. */ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) { unsigned int i = 0; out[size] = 0; while (i < size) { u8 c = le16_to_cpu(in[i]) & 0xff; if (c && !isprint(c)) c = '!'; out[i] = c; i++; } } /** * efi_partition - scan for GPT partitions * @state: disk parsed partitions * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. * * If the first block on the disk is a legacy MBR, * it will get handled by msdos_partition(). * If it's a Protective MBR, we'll handle it here. * * We do not create a Linux partition for GPT, but * only for the actual data partitions. * Returns: * -1 if unable to read the partition table * 0 if this isn't our partition table * 1 if successful * */ int efi_partition(struct parsed_partitions *state) { gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); kfree(ptes); return 0; } pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { struct partition_meta_info *info; unsigned label_max; u64 start = le64_to_cpu(ptes[i].starting_lba); u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; efi_guid_to_str(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ label_max = min(ARRAY_SIZE(info->volname) - 1, ARRAY_SIZE(ptes[i].partition_name)); utf16_le_to_7bit(ptes[i].partition_name, label_max, info->volname); state->parts[i + 1].has_info = true; } kfree(ptes); kfree(gpt); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } |
| 11 2 11 4 22 2 24 38 5 42 7 24 1 24 5 5 28 3 19 24 15 25 17 17 3 1 8 11 5 6 9 4 6 2 1 3 2 4 11 5 1 19 1 23 23 3 16 2 3 19 24 39 41 1 14 26 14 2 2 2 9 37 2 2 3 29 25 2 4 3 1 8 8 14 3 2 1 9 1 3 3 2 6 9 7 2 2 4 7 39 32 8 9 9 24 3 2 8 23 1 2 2 2 7 17 4 3 2 1 8 1 10 7 4 3 1 27 24 20 26 4 3 4 4 40 27 31 27 17 27 26 2 43 12 1 11 12 2 12 415 417 417 415 411 394 373 3 40 36 41 41 381 58 46 21 13 12 43 32 12 42 42 42 8 9 44 1 41 35 2 2 41 41 38 4 6 36 1 3 8 2 2 6 10 9 1 9 5 3 2 3 3 19 1 1 17 2 4 4 6 10 3 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "openclose.h" #include "rsrc.h" #include "memmap.h" #include "register.h" struct io_rsrc_update { struct file *file; u64 arg; u32 nr_args; u32 offset; }; static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; if (!nr_pages) return 0; /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; cur_pages = atomic_long_read(&user->locked_vm); do { new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); return 0; } static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; } if (ctx->mm_account) atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); return 0; } static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong. */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; if (!iov->iov_len) return -EFAULT; /* arbitrary limit, but we need something */ if (iov->iov_len > SZ_1G) return -EFAULT; if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) return -EOVERFLOW; return 0; } static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { unsigned int i; if (node->buf) { struct io_mapped_ubuf *imu = node->buf; if (!refcount_dec_and_test(&imu->refs)) return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); } } struct io_rsrc_node *io_rsrc_node_alloc(int type) { struct io_rsrc_node *node; node = kzalloc(sizeof(*node), GFP_KERNEL); if (node) { node->type = type; node->refs = 1; } return node; } __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { if (!data->nr) return; while (data->nr--) { if (data->nodes[data->nr]) io_put_rsrc_node(ctx, data->nodes[data->nr]); } kvfree(data->nodes); data->nodes = NULL; data->nr = 0; } __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) { data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (data->nodes) { data->nr = nr; return 0; } return -ENOMEM; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); int fd, i, err = 0; unsigned int done; if (!ctx->file_table.data.nr) return -ENXIO; if (up->offset + nr_args > ctx->file_table.data.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { u64 tag = 0; if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { err = -EINVAL; break; } if (fd == IORING_REGISTER_FILES_SKIP) continue; i = up->offset + done; if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); if (fd != -1) { struct file *file = fget(fd); struct io_rsrc_node *node; if (!file) { err = -EBADF; break; } /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; } node = io_rsrc_node_alloc(IORING_RSRC_FILE); if (!node) { err = -ENOMEM; fput(file); break; } ctx->file_table.data.nodes[i] = node; if (tag) node->tag = tag; io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } } return done ? done : err; } static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec fast_iov, *iov; struct page *last_hpage = NULL; struct iovec __user *uvec; u64 user_data = up->data; __u32 done; int i, err; if (!ctx->buf_table.nr) return -ENXIO; if (up->offset + nr_args > ctx->buf_table.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { struct io_rsrc_node *node; u64 tag = 0; uvec = u64_to_user_ptr(user_data); iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } err = io_buffer_validate(iov); if (err) break; node = io_sqe_buffer_register(ctx, iov, &last_hpage); if (IS_ERR(node)) { err = PTR_ERR(node); break; } if (tag) { if (!node) { err = -EINVAL; break; } node->tag = tag; } i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; if (ctx->compat) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); } return done ? done : err; } static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args) { __u32 tmp; lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; switch (type) { case IORING_RSRC_FILE: return __io_sqe_files_update(ctx, up, nr_args); case IORING_RSRC_BUFFER: return __io_sqe_buffers_update(ctx, up, nr_args); } return -EINVAL; } int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_rsrc_update2 up; if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; if (size != sizeof(up)) return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; /* keep it extendible */ if (size != sizeof(rr)) return -EINVAL; memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; if (!rr.nr || rr.resv2) return -EINVAL; if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } return -EINVAL; } int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; up->offset = READ_ONCE(sqe->off); up->nr_args = READ_ONCE(sqe->len); if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; if (!req->ctx->file_table.data.nr) return -ENXIO; for (done = 0; done < up->nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; break; } file = fget(fd); if (!file) { ret = -EBADF; break; } ret = io_fixed_fd_install(req, issue_flags, file, IORING_FILE_INDEX_ALLOC); if (ret < 0) break; if (copy_to_user(&fds[done], &ret, sizeof(ret))) { __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } } if (done) return done; return ret; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); struct io_ring_ctx *ctx = req->ctx; struct io_uring_rsrc_update2 up2; int ret; up2.offset = up->offset; up2.data = up->arg; up2.nr = 0; up2.tags = 0; up2.resv = 0; up2.resv2 = 0; if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (node->tag) io_post_aux_cqe(ctx, node->tag, 0, 0); switch (node->type) { case IORING_RSRC_FILE: if (io_slot_file(node)) fput(io_slot_file(node)); break; case IORING_RSRC_BUFFER: if (node->buf) io_buffer_unmap(ctx, node); break; default: WARN_ON_ONCE(1); break; } kfree(node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { if (!ctx->file_table.data.nr) return -ENXIO; io_free_file_tables(ctx, &ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); return 0; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { __s32 __user *fds = (__s32 __user *) arg; struct file *file; int fd, ret; unsigned i; if (ctx->file_table.data.nr) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM; for (i = 0; i < nr_args; i++) { struct io_rsrc_node *node; u64 tag = 0; ret = -EFAULT; if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) goto fail; if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) goto fail; /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; if (tag) goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) goto fail; /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); goto fail; } ret = -ENOMEM; node = io_rsrc_node_alloc(IORING_RSRC_FILE); if (!node) { fput(file); goto fail; } if (tag) node->tag = tag; ctx->file_table.data.nodes[i] = node; io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: io_sqe_files_unregister(ctx); return ret; } int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { if (!ctx->buf_table.nr) return -ENXIO; io_rsrc_data_free(ctx, &ctx->buf_table); return 0; } /* * Not super efficient, but this is just a registration time. And we do cache * the last compound head, so generally we'll only do a full search if we don't * match that one. * * We check if the given compound head page has already been accounted, to * avoid double accounting it. This allows us to account the full size of the * page, not just the constituent pages of a huge page. */ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct page *hpage) { int i, j; /* check current page array */ for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) continue; if (compound_head(pages[i]) == hpage) return true; } /* check previously registered pages */ for (i = 0; i < ctx->buf_table.nr; i++) { struct io_rsrc_node *node = ctx->buf_table.nodes[i]; struct io_mapped_ubuf *imu; if (!node) continue; imu = node->buf; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; if (compound_head(imu->bvec[j].bv_page) == hpage) return true; } } return false; } static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct io_mapped_ubuf *imu, struct page **last_hpage) { int i, ret; imu->acct_pages = 0; for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) { imu->acct_pages++; } else { struct page *hpage; hpage = compound_head(pages[i]); if (hpage == *last_hpage) continue; *last_hpage = hpage; if (headpage_already_acct(ctx, pages, i, hpage)) continue; imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; } } if (!imu->acct_pages) return 0; ret = io_account_mem(ctx, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; } static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; int nr_pages_left = *nr_pages, i, j; int nr_folios = data->nr_folios; /* Store head pages only*/ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; new_array[0] = compound_head(page_array[0]); /* * The pages are bound to the folio, it doesn't * actually unpin them but drops all but one reference, * which is usually put down by io_buffer_unmap(). * Note, needs a better helper. */ if (data->nr_pages_head > 1) unpin_user_pages(&page_array[1], data->nr_pages_head - 1); j = data->nr_pages_head; nr_pages_left -= data->nr_pages_head; for (i = 1; i < nr_folios; i++) { unsigned int nr_unpin; new_array[i] = page_array[j]; nr_unpin = min_t(unsigned int, nr_pages_left - 1, data->nr_pages_mid - 1); if (nr_unpin) unpin_user_pages(&page_array[j+1], nr_unpin); j += data->nr_pages_mid; nr_pages_left -= data->nr_pages_mid; } kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; return true; } bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data) { struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; data->nr_pages_mid = folio_nr_pages(folio); data->folio_shift = folio_shift(folio); /* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. */ for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; continue; } if (nr_folios == 1) { if (folio_page_idx(folio, page_array[i-1]) != data->nr_pages_mid - 1) return false; data->nr_pages_head = count; } else if (count != data->nr_pages_mid) { return false; } folio = page_folio(page_array[i]); if (folio_size(folio) != (1UL << data->folio_shift) || folio_page_idx(folio, page_array[i]) != 0) return false; count = 1; nr_folios++; } if (nr_folios == 1) data->nr_pages_head = count; data->nr_folios = nr_folios; return true; } static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; struct io_rsrc_node *node; unsigned long off; size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; bool coalesced = false; if (!iov->iov_base) return NULL; node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); node->buf = NULL; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); pages = NULL; goto done; } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { if (data.nr_pages_mid != 1) coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); } imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) goto done; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { unpin_user_pages(pages, nr_pages); goto done; } size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); node->buf = imu; ret = 0; for (i = 0; i < nr_pages; i++) { size_t vec_len; vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; } done: if (ret) { kvfree(imu); if (node) io_put_rsrc_node(ctx, node); node = ERR_PTR(ret); } kvfree(pages); return node; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; struct io_rsrc_data data; struct iovec fast_iov, *iov = &fast_iov; const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); if (ctx->buf_table.nr) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_data_alloc(&data, nr_args); if (ret) return ret; if (!arg) memset(iov, 0, sizeof(*iov)); for (i = 0; i < nr_args; i++) { struct io_rsrc_node *node; u64 tag = 0; if (arg) { uvec = (struct iovec __user *) arg; iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } ret = io_buffer_validate(iov); if (ret) break; if (ctx->compat) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); } if (tags) { if (copy_from_user(&tag, &tags[i], sizeof(tag))) { ret = -EFAULT; break; } } node = io_sqe_buffer_register(ctx, iov, &last_hpage); if (IS_ERR(node)) { ret = PTR_ERR(node); break; } if (tag) { if (!node) { ret = -EINVAL; break; } node->tag = tag; } data.nodes[i] = node; } ctx->buf_table = data; if (ret) io_sqe_buffers_unregister(ctx); return ret; } int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { u64 buf_end; size_t offset; if (WARN_ON_ONCE(!imu)) return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; /* * Might not be a start of buffer, set size appropriately * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); if (offset) { /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates * over each segment manually. We can cheat a bit here, because * we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the * first and last bvec * * So just find our index, and adjust the iterator afterwards. * If the offset is within the first bvec (or the whole first * bvec, just use iov_iter_advance(). This makes it easier * since we can just skip the first segment, which may not * be folio_size aligned. */ const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { iter->iov_offset = offset; } else { unsigned long seg_skip; /* skip first vec */ offset -= bvec->bv_len; seg_skip = 1 + (offset >> imu->folio_shift); iter->bvec += seg_skip; iter->nr_segs -= seg_skip; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } return 0; } /* Lock two rings at once. The rings must be different! */ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) { if (ctx1 > ctx2) swap(ctx1, ctx2); mutex_lock(&ctx1->uring_lock); mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); } /* Both rings are locked by the caller. */ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, struct io_uring_clone_buffers *arg) { struct io_rsrc_data data; int i, ret, off, nr; unsigned int nbufs; lockdep_assert_held(&ctx->uring_lock); lockdep_assert_held(&src_ctx->uring_lock); /* * Accounting state is shared between the two rings; that only works if * both rings are accounted towards the same counters. */ if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) return -EINVAL; /* if offsets are given, must have nr specified too */ if (!arg->nr && (arg->dst_off || arg->src_off)) return -EINVAL; /* not allowed unless REPLACE is set */ if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) return -EBUSY; nbufs = src_ctx->buf_table.nr; if (!arg->nr) arg->nr = nbufs; else if (arg->nr > nbufs) return -EINVAL; else if (arg->nr > IORING_MAX_REG_BUFFERS) return -EINVAL; if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) return -EOVERFLOW; ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); if (ret) return ret; /* Fill entries in data from dst that won't overlap with src */ for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { struct io_rsrc_node *src_node = ctx->buf_table.nodes[i]; if (src_node) { data.nodes[i] = src_node; src_node->refs++; } } ret = -ENXIO; nbufs = src_ctx->buf_table.nr; if (!nbufs) goto out_free; ret = -EINVAL; if (!arg->nr) arg->nr = nbufs; else if (arg->nr > nbufs) goto out_free; ret = -EOVERFLOW; if (check_add_overflow(arg->nr, arg->src_off, &off)) goto out_free; if (off > nbufs) goto out_free; off = arg->dst_off; i = arg->src_off; nr = arg->nr; while (nr--) { struct io_rsrc_node *dst_node, *src_node; src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); if (!src_node) { dst_node = NULL; } else { dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); if (!dst_node) { ret = -ENOMEM; goto out_free; } refcount_inc(&src_node->buf->refs); dst_node->buf = src_node->buf; } data.nodes[off++] = dst_node; i++; } /* * If asked for replace, put the old table. data->nodes[] holds both * old and new nodes at this point. */ if (arg->flags & IORING_REGISTER_DST_REPLACE) io_rsrc_data_free(ctx, &ctx->buf_table); /* * ctx->buf_table must be empty now - either the contents are being * replaced and we just freed the table, or the contents are being * copied to a ring that does not have buffers yet (checked at function * entry). */ WARN_ON_ONCE(ctx->buf_table.nr); ctx->buf_table = data; return 0; out_free: io_rsrc_data_free(ctx, &data); return ret; } /* * Copy the registered buffers from the source ring whose file descriptor * is given in the src_fd to the current ring. This is identical to registering * the buffers with ctx, except faster as mappings already exist. * * Since the memory is already accounted once, don't account it again. */ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_clone_buffers buf; struct io_ring_ctx *src_ctx; bool registered_src; struct file *file; int ret; if (copy_from_user(&buf, arg, sizeof(buf))) return -EFAULT; if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) return -EINVAL; if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) return -EBUSY; if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) return -EINVAL; registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); src_ctx = file->private_data; if (src_ctx != ctx) { mutex_unlock(&ctx->uring_lock); lock_two_rings(ctx, src_ctx); } ret = io_clone_buffers(ctx, src_ctx, &buf); if (src_ctx != ctx) mutex_unlock(&src_ctx->uring_lock); fput(file); return ret; } |
| 1 1 13 5 9 2 2 2 1 10 3 10 2 2 10 4 2 2 2 2 1 1 46 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 | // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for GRE. * * GRE is a generic encapsulation protocol, which is generally not very * suited for NAT, as it has no protocol-specific part as port numbers. * * It has an optional key field, which may help us distinguishing two * connections between the same two hosts. * * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 * * PPTP is built on top of a modified version of GRE, and has a mandatory * field called "CallID", which serves us for the same purpose as the key * field in plain GRE. * * Documentation about PPTP can be found in RFC 2637 * * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) * * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/list.h> #include <linux/seq_file.h> #include <linux/in.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_UNREPLIED] = 30*HZ, [GRE_CT_REPLIED] = 180*HZ, }; /* used when expectation is added */ static DEFINE_SPINLOCK(keymap_lock); static inline struct nf_gre_net *gre_pernet(struct net *net) { return &net->ct.nf_ct_proto.gre; } static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) { return km->tuple.src.l3num == t->src.l3num && !memcmp(&km->tuple.src.u3, &t->src.u3, sizeof(t->src.u3)) && !memcmp(&km->tuple.dst.u3, &t->dst.u3, sizeof(t->dst.u3)) && km->tuple.dst.protonum == t->dst.protonum && km->tuple.dst.u.all == t->dst.u.all; } /* look up the source key for a given tuple */ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) { struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km; __be16 key = 0; list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t)) { key = km->tuple.src.u.gre.key; break; } } pr_debug("lookup src key 0x%x for ", key); nf_ct_dump_tuple(t); return key; } /* add a single keymap entry, associate with specified master ct */ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t) { struct net *net = nf_ct_net(ct); struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); struct nf_ct_gre_keymap **kmp, *km; kmp = &ct_pptp_info->keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t) && km == *kmp) return 0; } pr_debug("trying to override keymap_%s for ct %p\n", dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; } km = kmalloc(sizeof(*km), GFP_ATOMIC); if (!km) return -ENOMEM; memcpy(&km->tuple, t, sizeof(*t)); *kmp = km; pr_debug("adding new entry %p: ", km); nf_ct_dump_tuple(&km->tuple); spin_lock_bh(&keymap_lock); list_add_tail(&km->list, &net_gre->keymap_list); spin_unlock_bh(&keymap_lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); /* destroy the keymap entries associated with specified master ct */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct) { struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); enum ip_conntrack_dir dir; pr_debug("entering for ct %p\n", ct); spin_lock_bh(&keymap_lock); for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { if (ct_pptp_info->keymap[dir]) { pr_debug("removing %p from list\n", ct_pptp_info->keymap[dir]); list_del_rcu(&ct_pptp_info->keymap[dir]->list); kfree_rcu(ct_pptp_info->keymap[dir], rcu); ct_pptp_info->keymap[dir] = NULL; } } spin_unlock_bh(&keymap_lock); } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ /* gre hdr info to tuple */ bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct pptp_gre_header *pgrehdr; struct pptp_gre_header _pgrehdr; __be16 srckey; const struct gre_base_hdr *grehdr; struct gre_base_hdr _grehdr; /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) { /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; return true; } /* PPTP header is variable length, only need up to the call_id field */ pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); if (!pgrehdr) return true; if (grehdr->protocol != GRE_PROTO_PPP) { pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol)); return false; } tuple->dst.u.gre.key = pgrehdr->call_id; srckey = gre_keymap_lookup(net, tuple); tuple->src.u.gre.key = srckey; return true; } #ifdef CONFIG_NF_CONNTRACK_PROCFS /* print private data for conntrack */ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "timeout=%u, stream_timeout=%u ", (ct->proto.gre.timeout / HZ), (ct->proto.gre.stream_timeout / HZ)); } #endif static unsigned int *gre_get_timeouts(struct net *net) { return gre_pernet(net)->timeouts; } /* Returns verdict for packet, and may modify conntrack */ int nf_conntrack_gre_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned long status; if (!nf_ct_is_confirmed(ct)) { unsigned int *timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = gre_get_timeouts(nf_ct_net(ct)); /* initialize to sane value. Ideally a conntrack helper * (e.g. in case of pptp) is increasing them */ ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; } status = READ_ONCE(ct->status); /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ if (status & IPS_SEEN_REPLY) { nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe. */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.timeout); return NF_ACCEPT; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_gre_net *net_gre = gre_pernet(net); if (!timeouts) timeouts = gre_get_timeouts(net); /* set default timeouts for GRE. */ timeouts[GRE_CT_UNREPLIED] = net_gre->timeouts[GRE_CT_UNREPLIED]; timeouts[GRE_CT_REPLIED] = net_gre->timeouts[GRE_CT_REPLIED]; if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) { timeouts[GRE_CT_UNREPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ; } if (tb[CTA_TIMEOUT_GRE_REPLIED]) { timeouts[GRE_CT_REPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ; } return 0; } static int gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED, htonl(timeouts[GRE_CT_UNREPLIED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED, htonl(timeouts[GRE_CT_REPLIED] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_gre_init_net(struct net *net) { struct nf_gre_net *net_gre = gre_pernet(net); int i; INIT_LIST_HEAD(&net_gre->keymap_list); for (i = 0; i < GRE_CT_MAX; i++) net_gre->timeouts[i] = gre_timeouts[i]; } /* protocol helper struct */ const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = { .l4proto = IPPROTO_GRE, .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = gre_timeout_nlattr_to_obj, .obj_to_nlattr = gre_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_GRE_MAX, .obj_size = sizeof(unsigned int) * GRE_CT_MAX, .nla_policy = gre_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
| 207 204 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 | // SPDX-License-Identifier: GPL-2.0-only /* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges * * Copyright (c) 2019-2020 Red Hat GmbH * * Author: Stefano Brivio <sbrivio@redhat.com> */ /** * DOC: Theory of Operation * * * Problem * ------- * * Match packet bytes against entries composed of ranged or non-ranged packet * field specifiers, mapping them to arbitrary references. For example: * * :: * * --- fields ---> * | [net],[port],[net]... => [reference] * entries [net],[port],[net]... => [reference] * | [net],[port],[net]... => [reference] * V ... * * where [net] fields can be IP ranges or netmasks, and [port] fields are port * ranges. Arbitrary packet fields can be matched. * * * Algorithm Overview * ------------------ * * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally * relies on the consideration that every contiguous range in a space of b bits * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], * as also illustrated in Section 9 of [Kogan 2014]. * * Classification against a number of entries, that require matching given bits * of a packet field, is performed by grouping those bits in sets of arbitrary * size, and classifying packet bits one group at a time. * * Example: * to match the source port (16 bits) of a packet, we can divide those 16 bits * in 4 groups of 4 bits each. Given the entry: * 0000 0001 0101 1001 * and a packet with source port: * 0000 0001 1010 1001 * first and second groups match, but the third doesn't. We conclude that the * packet doesn't match the given entry. * * Translate the set to a sequence of lookup tables, one per field. Each table * has two dimensions: bit groups to be matched for a single packet field, and * all the possible values of said groups (buckets). Input entries are * represented as one or more rules, depending on the number of composing * netmasks for the given field specifier, and a group match is indicated as a * set bit, with number corresponding to the rule index, in all the buckets * whose value matches the entry for a given group. * * Rules are mapped between fields through an array of x, n pairs, with each * item mapping a matched rule to one or more rules. The position of the pair in * the array indicates the matched rule to be mapped to the next field, x * indicates the first rule index in the next field, and n the amount of * next-field rules the current rule maps to. * * The mapping array for the last field maps to the desired references. * * To match, we perform table lookups using the values of grouped packet bits, * and use a sequence of bitwise operations to progressively evaluate rule * matching. * * A stand-alone, reference implementation, also including notes about possible * future optimisations, is available at: * https://pipapo.lameexcu.se/ * * Insertion * --------- * * - For each packet field: * * - divide the b packet bits we want to classify into groups of size t, * obtaining ceil(b / t) groups * * Example: match on destination IP address, with t = 4: 32 bits, 8 groups * of 4 bits each * * - allocate a lookup table with one column ("bucket") for each possible * value of a group, and with one row for each group * * Example: 8 groups, 2^4 buckets: * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 * 1 * 2 * 3 * 4 * 5 * 6 * 7 * * - map the bits we want to classify for the current field, for a given * entry, to a single rule for non-ranged and netmask set items, and to one * or multiple rules for ranges. Ranges are expanded to composing netmasks * by pipapo_expand(). * * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 * - rule #0: 10.0.0.5 * - rule #1: 192.168.1.0/24 * - rule #2: 192.168.2.0/31 * * - insert references to the rules in the lookup table, selecting buckets * according to bit values of a rule in the given group. This is done by * pipapo_insert(). * * Example: given: * - rule #0: 10.0.0.5 mapping to buckets * < 0 10 0 0 0 0 0 5 > * - rule #1: 192.168.1.0/24 mapping to buckets * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > * - rule #2: 192.168.2.0/31 mapping to buckets * < 12 0 10 8 0 2 0 < 0..1 > > * * these bits are set in the lookup table: * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * 1 1,2 0 * 2 0 1,2 * 3 0 1,2 * 4 0,1,2 * 5 0 1 2 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * * - if this is not the last field in the set, fill a mapping array that maps * rules from the lookup table to rules belonging to the same entry in * the next lookup table, done by pipapo_map(). * * Note that as rules map to contiguous ranges of rules, given how netmask * expansion and insertion is performed, &union nft_pipapo_map_bucket stores * this information as pairs of first rule index, rule count. * * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, * given lookup table #0 for field 0 (see example above): * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * 1 1,2 0 * 2 0 1,2 * 3 0 1,2 * 4 0,1,2 * 5 0 1 2 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * * and lookup table #1 for field 1 with: * - rule #0: 1024 mapping to buckets * < 0 0 4 0 > * - rule #1: 2048 mapping to buckets * < 0 0 5 0 > * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0,1 * 1 0,1 * 2 0 1 * 3 0,1 * * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 * (rules #1, #2) to 2048 in lookup table #2 (rule #1): * * :: * * rule indices in current field: 0 1 2 * map to rules in next field: 0 1 1 * * - if this is the last field in the set, fill a mapping array that maps * rules from the last lookup table to element pointers, also done by * pipapo_map(). * * Note that, in this implementation, we have two elements (start, end) for * each entry. The pointer to the end element is stored in this array, and * the pointer to the start element is linked from it. * * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. * From the rules of lookup table #1 as mapped above: * * :: * * rule indices in last field: 0 1 * map to elements: 0x66 0x42 * * * Matching * -------- * * We use a result bitmap, with the size of a single lookup table bucket, to * represent the matching state that applies at every algorithm step. This is * done by pipapo_lookup(). * * - For each packet field: * * - start with an all-ones result bitmap (res_map in pipapo_lookup()) * * - perform a lookup into the table corresponding to the current field, * for each group, and at every group, AND the current result bitmap with * the value from the lookup table bucket * * :: * * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from * insertion examples. * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for * convenience in this example. Initial result bitmap is 0xff, the steps * below show the value of the result bitmap after each group is processed: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 * * 1 1,2 0 * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 * * 2 0 1,2 * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 * * 3 0 1,2 * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 * * 4 0,1,2 * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 * * 5 0 1 2 * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 * * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 * * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 * * - at the next field, start with a new, all-zeroes result bitmap. For each * bit set in the previous result bitmap, fill the new result bitmap * (fill_map in pipapo_lookup()) with the rule indices from the * corresponding buckets of the mapping field for this field, done by * pipapo_refill() * * Example: with mapping table from insertion examples, with the current * result bitmap from the previous example, 0x02: * * :: * * rule indices in current field: 0 1 2 * map to rules in next field: 0 1 1 * * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be * set. * * We can now extend this example to cover the second iteration of the step * above (lookup and AND bitmap): assuming the port field is * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table * for "port" field from pre-computation example: * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0,1 * 1 0,1 * 2 0 1 * 3 0,1 * * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] * & 0x3 [bucket 0], resulting bitmap is 0x2. * * - if this is the last field in the set, look up the value from the mapping * array corresponding to the final result bitmap * * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for * last field from insertion example: * * :: * * rule indices in last field: 0 1 * map to elements: 0x66 0x42 * * the matching element is at 0x42. * * * References * ---------- * * [Ligatti 2010] * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with * Automatic Time-space Tradeoffs * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion * Ori Rottenstreich and Isaac Keslassy. * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf * * [Kogan 2014] * SAX-PAC (Scalable And eXpressive PAcket Classification) * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <uapi/linux/netfilter/nf_tables.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" /** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits * @len: Length of bitmap in longs * @rules: Number of rules in field * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers * @match_only: Find a single bit and return, don't fill * * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. * * For each bit set in map, select the bucket from mapping table with index * corresponding to the position of the bit set. Use start bit and amount of * bits specified in bucket to fill region in dst. * * Return: -1 on no match, bit position on 'match_only', 0 otherwise. */ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, unsigned long *dst, const union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; unsigned int k; int ret = -1; for (k = 0; k < len; k++) { bitset = map[k]; while (bitset) { unsigned long t = bitset & -bitset; int r = __builtin_ctzl(bitset); int i = k * BITS_PER_LONG + r; if (unlikely(i >= rules)) { map[k] = 0; return -1; } if (match_only) { bitmap_clear(map, i, 1); return i; } ret = 0; bitmap_set(dst, mt[i].to, mt[i].n); bitset ^= t; } map[k] = 0; } return ret; } /** * nft_pipapo_lookup() - Lookup function * @net: Network namespace * @set: nftables API set representation * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation. * * Return: true on match, false otherwise. */ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; unsigned long *res_map, *fill_map; u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; bool map_index; int i; local_bh_disable(); m = rcu_dereference(priv->match); if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) goto out; scratch = *raw_cpu_ptr(m->scratch); map_index = scratch->map_index; res_map = scratch->map + (map_index ? m->bsize_max : 0); fill_map = scratch->map + (map_index ? 0 : m->bsize_max); pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; int b; /* For each bit group: select lookup table bucket depending on * packet bytes value, then AND bucket value */ if (likely(f->bb == 8)) pipapo_and_field_buckets_8bit(f, res_map, rp); else pipapo_and_field_buckets_4bit(f, res_map, rp); NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' * pointer if any. * * Now res_map contains the matching bitmap, and fill_map is the * bitmap for the next field. */ next_match: b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, last); if (b < 0) { scratch->map_index = map_index; local_bh_enable(); return false; } if (last) { *ext = &f->mt[b].e->ext; if (unlikely(nft_set_elem_expired(*ext) || !nft_set_elem_active(*ext, genmask))) goto next_match; /* Last field: we're just returning the key without * filling the initial bitmap for the next field, so the * current inactive bitmap is clean and can be reused as * *next* bitmap (not initial) for the next packet. */ scratch->map_index = map_index; local_bh_enable(); return true; } /* Swap bitmap indices: res_map is the initial bitmap for the * next field, and fill_map is guaranteed to be all-zeroes at * this point. */ map_index = !map_index; swap(res_map, fill_map); rp += NFT_PIPAPO_GROUPS_PADDING(f); } out: local_bh_enable(); return false; } /** * pipapo_get() - Get matching element reference given key data * @net: Network namespace * @set: nftables API set representation * @m: storage containing active/existing elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask * @tstamp: timestamp to check for expired elements * @gfp: the type of memory to allocate (see kmalloc). * * This is essentially the same as the lookup function, except that it matches * key data against the uncommitted copy and doesn't use preallocated maps for * bitmap results. * * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. */ static struct nft_pipapo_elem *pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_pipapo_match *m, const u8 *data, u8 genmask, u64 tstamp, gfp_t gfp) { struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); unsigned long *res_map, *fill_map = NULL; const struct nft_pipapo_field *f; int i; if (m->bsize_max == 0) return ret; res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), gfp); if (!res_map) { ret = ERR_PTR(-ENOMEM); goto out; } fill_map = kcalloc(m->bsize_max, sizeof(*res_map), gfp); if (!fill_map) { ret = ERR_PTR(-ENOMEM); goto out; } pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; int b; /* For each bit group: select lookup table bucket depending on * packet bytes value, then AND bucket value */ if (f->bb == 8) pipapo_and_field_buckets_8bit(f, res_map, data); else if (f->bb == 4) pipapo_and_field_buckets_4bit(f, res_map, data); else BUG(); data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' * pointer if any. * * Now res_map contains the matching bitmap, and fill_map is the * bitmap for the next field. */ next_match: b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, last); if (b < 0) goto out; if (last) { if (__nft_set_elem_expired(&f->mt[b].e->ext, tstamp)) goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; ret = f->mt[b].e; goto out; } data += NFT_PIPAPO_GROUPS_PADDING(f); /* Swap bitmap indices: fill_map will be the initial bitmap for * the next field (i.e. the new res_map), and res_map is * guaranteed to be all-zeroes at this point, ready to be filled * according to the next mapping table. */ swap(res_map, fill_map); } out: kfree(fill_map); kfree(res_map); return ret; } /** * nft_pipapo_get() - Get matching element reference given key data * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data * @flags: Unused */ static struct nft_elem_priv * nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = rcu_dereference(priv->match); struct nft_pipapo_elem *e; e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, nft_genmask_cur(net), get_jiffies_64(), GFP_ATOMIC); if (IS_ERR(e)) return ERR_CAST(e); return &e->priv; } /** * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize * @f: Field containing mapping table * @old_rules: Amount of existing mapped rules * @rules: Amount of new rules to map * * Return: 0 on success, negative error code on failure. */ static int pipapo_realloc_mt(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules) { union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt; const unsigned int extra = PAGE_SIZE / sizeof(*new_mt); unsigned int rules_alloc = rules; might_sleep(); if (unlikely(rules == 0)) goto out_free; /* growing and enough space left, no action needed */ if (rules > old_rules && f->rules_alloc > rules) return 0; /* downsize and extra slack has not grown too large */ if (rules < old_rules) { unsigned int remove = f->rules_alloc - rules; if (remove < (2u * extra)) return 0; } /* If set needs more than one page of memory for rules then * allocate another extra page to avoid frequent reallocation. */ if (rules > extra && check_add_overflow(rules, extra, &rules_alloc)) return -EOVERFLOW; new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); if (!new_mt) return -ENOMEM; if (old_mt) memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt)); if (rules > old_rules) { memset(new_mt + old_rules, 0, (rules - old_rules) * sizeof(*new_mt)); } out_free: f->rules_alloc = rules_alloc; f->mt = new_mt; kvfree(old_mt); return 0; } /** * pipapo_resize() - Resize lookup or mapping table, or both * @f: Field containing lookup and mapping tables * @old_rules: Previous amount of rules in field * @rules: New amount of rules * * Increase, decrease or maintain tables size depending on new amount of rules, * and copy data over. In case the new size is smaller, throw away data for * highest-numbered rules. * * Return: 0 on success, -ENOMEM on allocation failure. */ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules) { long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; unsigned int new_bucket_size, copy; int group, bucket, err; if (rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); #ifdef NFT_PIPAPO_ALIGN new_bucket_size = roundup(new_bucket_size, NFT_PIPAPO_ALIGN / sizeof(*new_lt)); #endif if (new_bucket_size == f->bsize) goto mt; if (new_bucket_size > f->bsize) copy = f->bsize; else copy = new_bucket_size; new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * new_bucket_size * sizeof(*new_lt) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); if (!new_lt) return -ENOMEM; new_p = NFT_PIPAPO_LT_ALIGN(new_lt); old_p = NFT_PIPAPO_LT_ALIGN(old_lt); for (group = 0; group < f->groups; group++) { for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) { memcpy(new_p, old_p, copy * sizeof(*new_p)); new_p += copy; old_p += copy; if (new_bucket_size > f->bsize) new_p += new_bucket_size - f->bsize; else old_p += f->bsize - new_bucket_size; } } mt: err = pipapo_realloc_mt(f, old_rules, rules); if (err) { kvfree(new_lt); return err; } if (new_lt) { f->bsize = new_bucket_size; f->lt = new_lt; kvfree(old_lt); } return 0; } /** * pipapo_bucket_set() - Set rule bit in bucket given group and group value * @f: Field containing lookup table * @rule: Rule index * @group: Group index * @v: Value of bit group */ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, int v) { unsigned long *pos; pos = NFT_PIPAPO_LT_ALIGN(f->lt); pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group; pos += f->bsize * v; __set_bit(rule, pos); } /** * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits * @old_groups: Number of current groups * @bsize: Size of one bucket, in longs * @old_lt: Pointer to the current lookup table * @new_lt: Pointer to the new, pre-allocated lookup table * * Each bucket with index b in the new lookup table, belonging to group g, is * filled with the bit intersection between: * - bucket with index given by the upper 4 bits of b, from group g, and * - bucket with index given by the lower 4 bits of b, from group g + 1 * * That is, given buckets from the new lookup table N(x, y) and the old lookup * table O(x, y), with x bucket index, and y group index: * * N(b, g) := O(b / 16, g) & O(b % 16, g + 1) * * This ensures equivalence of the matching results on lookup. Two examples in * pictures: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255 * 0 ^ * 1 | ^ * ... ( & ) | * / \ | * / \ .-( & )-. * / bucket \ | | * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 | * 0 / \ | | * 1 \ | | * 2 | --' * 3 '- * ... */ static void pipapo_lt_4b_to_8b(int old_groups, int bsize, unsigned long *old_lt, unsigned long *new_lt) { int g, b, i; for (g = 0; g < old_groups / 2; g++) { int src_g0 = g * 2, src_g1 = g * 2 + 1; for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) { int src_b0 = b / NFT_PIPAPO_BUCKETS(4); int src_b1 = b % NFT_PIPAPO_BUCKETS(4); int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0; int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1; for (i = 0; i < bsize; i++) { *new_lt = old_lt[src_i0 * bsize + i] & old_lt[src_i1 * bsize + i]; new_lt++; } } } } /** * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits * @old_groups: Number of current groups * @bsize: Size of one bucket, in longs * @old_lt: Pointer to the current lookup table * @new_lt: Pointer to the new, pre-allocated lookup table * * Each bucket with index b in the new lookup table, belonging to group g, is * filled with the bit union of: * - all the buckets with index such that the upper four bits of the lower byte * equal b, from group g, with g odd * - all the buckets with index such that the lower four bits equal b, from * group g, with g even * * That is, given buckets from the new lookup table N(x, y) and the old lookup * table O(x, y), with x bucket index, and y group index: * * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4) * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f) * * where U() denotes the arbitrary union operation (binary OR of n terms). This * ensures equivalence of the matching results on lookup. */ static void pipapo_lt_8b_to_4b(int old_groups, int bsize, unsigned long *old_lt, unsigned long *new_lt) { int g, b, bsrc, i; memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize * sizeof(unsigned long)); for (g = 0; g < old_groups * 2; g += 2) { int src_g = g / 2; for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); bsrc++) { if (((bsrc & 0xf0) >> 4) != b) continue; for (i = 0; i < bsize; i++) new_lt[i] |= old_lt[bsrc * bsize + i]; } new_lt += bsize; } for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); bsrc++) { if ((bsrc & 0x0f) != b) continue; for (i = 0; i < bsize; i++) new_lt[i] |= old_lt[bsrc * bsize + i]; } new_lt += bsize; } } } /** * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed * @f: Field containing lookup table */ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { unsigned int groups, bb; unsigned long *new_lt; size_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * sizeof(*f->lt); if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET && lt_size > NFT_PIPAPO_LT_SIZE_HIGH) { groups = f->groups * 2; bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * sizeof(*f->lt); } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && lt_size < NFT_PIPAPO_LT_SIZE_LOW) { groups = f->groups / 2; bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * sizeof(*f->lt); /* Don't increase group width if the resulting lookup table size * would exceed the upper size threshold for a "small" set. */ if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH) return; } else { return; } new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT); if (!new_lt) return; NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; if (f->bb == 4 && bb == 8) { pipapo_lt_4b_to_8b(f->groups, f->bsize, NFT_PIPAPO_LT_ALIGN(f->lt), NFT_PIPAPO_LT_ALIGN(new_lt)); } else if (f->bb == 8 && bb == 4) { pipapo_lt_8b_to_4b(f->groups, f->bsize, NFT_PIPAPO_LT_ALIGN(f->lt), NFT_PIPAPO_LT_ALIGN(new_lt)); } else { BUG(); } f->groups = groups; f->bb = bb; kvfree(f->lt); f->lt = new_lt; } /** * pipapo_insert() - Insert new rule in field given input key and mask length * @f: Field containing lookup table * @k: Input key for classification, without nftables padding * @mask_bits: Length of mask; matches field length for non-ranged entry * * Insert a new rule reference in lookup buckets corresponding to k and * mask_bits. * * Return: 1 on success (one rule inserted), negative error code on failure. */ static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { unsigned int rule = f->rules, group, ret, bit_offset = 0; ret = pipapo_resize(f, f->rules, f->rules + 1); if (ret) return ret; f->rules++; for (group = 0; group < f->groups; group++) { int i, v; u8 mask; v = k[group / (BITS_PER_BYTE / f->bb)]; v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0); v >>= (BITS_PER_BYTE - bit_offset) - f->bb; bit_offset += f->bb; bit_offset %= BITS_PER_BYTE; if (mask_bits >= (group + 1) * f->bb) { /* Not masked */ pipapo_bucket_set(f, rule, group, v); } else if (mask_bits <= group * f->bb) { /* Completely masked */ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) pipapo_bucket_set(f, rule, group, i); } else { /* The mask limit falls on this group */ mask = GENMASK(f->bb - 1, 0); mask >>= mask_bits - group * f->bb; for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) { if ((i & ~mask) == (v & ~mask)) pipapo_bucket_set(f, rule, group, i); } } } pipapo_lt_bits_adjust(f); return 1; } /** * pipapo_step_diff() - Check if setting @step bit in netmask would change it * @base: Mask we are expanding * @step: Step bit for given expansion step * @len: Total length of mask space (set and unset bits), bytes * * Convenience function for mask expansion. * * Return: true if step bit changes mask (i.e. isn't set), false otherwise. */ static bool pipapo_step_diff(u8 *base, int step, int len) { /* Network order, byte-addressed */ #ifdef __BIG_ENDIAN__ return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); #else return !(BIT(step % BITS_PER_BYTE) & base[len - 1 - step / BITS_PER_BYTE]); #endif } /** * pipapo_step_after_end() - Check if mask exceeds range end with given step * @base: Mask we are expanding * @end: End of range * @step: Step bit for given expansion step, highest bit to be set * @len: Total length of mask space (set and unset bits), bytes * * Convenience function for mask expansion. * * Return: true if mask exceeds range setting step bits, false otherwise. */ static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, int len) { u8 tmp[NFT_PIPAPO_MAX_BYTES]; int i; memcpy(tmp, base, len); /* Network order, byte-addressed */ for (i = 0; i <= step; i++) #ifdef __BIG_ENDIAN__ tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); #else tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); #endif return memcmp(tmp, end, len) > 0; } /** * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry * @base: Netmask base * @step: Step bit to sum * @len: Netmask length, bytes */ static void pipapo_base_sum(u8 *base, int step, int len) { bool carry = false; int i; /* Network order, byte-addressed */ #ifdef __BIG_ENDIAN__ for (i = step / BITS_PER_BYTE; i < len; i++) { #else for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { #endif if (carry) base[i]++; else base[i] += 1 << (step % BITS_PER_BYTE); if (base[i]) break; carry = true; } } /** * pipapo_expand() - Expand to composing netmasks, insert into lookup table * @f: Field containing lookup table * @start: Start of range * @end: End of range * @len: Length of value in bits * * Expand range to composing netmasks and insert corresponding rule references * in lookup buckets. * * Return: number of inserted rules on success, negative error code on failure. */ static int pipapo_expand(struct nft_pipapo_field *f, const u8 *start, const u8 *end, int len) { int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); u8 base[NFT_PIPAPO_MAX_BYTES]; memcpy(base, start, bytes); while (memcmp(base, end, bytes) <= 0) { int err; step = 0; while (pipapo_step_diff(base, step, bytes)) { if (pipapo_step_after_end(base, end, step, bytes)) break; step++; if (step >= len) { if (!masks) { err = pipapo_insert(f, base, 0); if (err < 0) return err; masks = 1; } goto out; } } err = pipapo_insert(f, base, len - step); if (err < 0) return err; masks++; pipapo_base_sum(base, step, bytes); } out: return masks; } /** * pipapo_map() - Insert rules in mapping tables, mapping them between fields * @m: Matching data, including mapping table * @map: Table of rule maps: array of first rule and amount of rules * in next field a given rule maps to, for each field * @e: For last field, nft_set_ext pointer matching rules map to */ static void pipapo_map(struct nft_pipapo_match *m, union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], struct nft_pipapo_elem *e) { struct nft_pipapo_field *f; int i, j; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { for (j = 0; j < map[i].n; j++) { f->mt[map[i].to + j].to = map[i + 1].to; f->mt[map[i].to + j].n = map[i + 1].n; } } /* Last field: map to ext instead of mapping to next field */ for (j = 0; j < map[i].n; j++) f->mt[map[i].to + j].e = e; } /** * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address * @m: Matching data * @cpu: CPU number */ static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) { struct nft_pipapo_scratch *s; void *mem; s = *per_cpu_ptr(m->scratch, cpu); if (!s) return; mem = s; mem -= s->align_off; kfree(mem); } /** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions * @bsize_max: Maximum bucket size, scratch maps cover two buckets * * Return: 0 on success, -ENOMEM on failure. */ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, unsigned long bsize_max) { int i; for_each_possible_cpu(i) { struct nft_pipapo_scratch *scratch; #ifdef NFT_PIPAPO_ALIGN void *scratch_aligned; u32 align_off; #endif scratch = kzalloc_node(struct_size(scratch, map, bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous * allocations: this means that some scratch maps have * a bigger allocated size now (this is only called on * insertion), but the extra space won't be used by any * CPU as new elements are not inserted and m->bsize_max * is not updated. */ return -ENOMEM; } pipapo_free_scratch(clone, i); #ifdef NFT_PIPAPO_ALIGN /* Align &scratch->map (not the struct itself): the extra * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() * above guarantee we can waste up to those bytes in order * to align the map field regardless of its offset within * the struct. */ BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); align_off = scratch_aligned - (void *)scratch; scratch = scratch_aligned; scratch->align_off = align_off; #endif *per_cpu_ptr(clone->scratch, i) = scratch; } return 0; } static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) { #ifdef CONFIG_PROVE_LOCKING const struct net *net = read_pnet(&set->net); return lockdep_is_held(&nft_pernet(net)->commit_mutex); #else return true; #endif } static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); /** * pipapo_maybe_clone() - Build clone for pending data changes, if not existing * @set: nftables API set representation * * Return: newly created or existing clone, if any. NULL on allocation failure */ static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; if (priv->clone) return priv->clone; m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); priv->clone = pipapo_clone(m); return priv->clone; } /** * nft_pipapo_insert() - Validate and insert ranged elements * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data * @elem_priv: Filled with pointer to &struct nft_set_ext in inserted element * * Return: 0 on success, error pointer on failure. */ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; struct nft_pipapo_match *m = pipapo_maybe_clone(set); u8 genmask = nft_genmask_next(net); struct nft_pipapo_elem *e, *dup; u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_field *f; const u8 *start_p, *end_p; int i, bsize_max, err = 0; if (!m) return -ENOMEM; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL); if (!IS_ERR(dup)) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; dup_key = nft_set_ext_key(&dup->ext); if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END)) dup_end = nft_set_ext_key_end(&dup->ext); else dup_end = dup_key; if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { *elem_priv = &dup->priv; return -EEXIST; } return -ENOTEMPTY; } if (PTR_ERR(dup) == -ENOENT) { /* Look for partially overlapping entries */ dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp, GFP_KERNEL); } if (PTR_ERR(dup) != -ENOENT) { if (IS_ERR(dup)) return PTR_ERR(dup); *elem_priv = &dup->priv; return -ENOTEMPTY; } /* Validate */ start_p = start; end_p = end; /* some helpers return -1, or 0 >= for valid rule pos, * so we cannot support more than INT_MAX rules at this time. */ BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX); nft_pipapo_for_each_field(f, i, m) { if (f->rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; if (memcmp(start_p, end_p, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0) return -EINVAL; start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } /* Insert */ bsize_max = m->bsize_max; nft_pipapo_for_each_field(f, i, m) { int ret; rulemap[i].to = f->rules; ret = memcmp(start, end, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); if (!ret) ret = pipapo_insert(f, start, f->groups * f->bb); else ret = pipapo_expand(f, start, end, f->groups * f->bb); if (ret < 0) return ret; if (f->bsize > bsize_max) bsize_max = f->bsize; rulemap[i].n = ret; start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { put_cpu_ptr(m->scratch); err = pipapo_realloc_scratch(m, bsize_max); if (err) return err; m->bsize_max = bsize_max; } else { put_cpu_ptr(m->scratch); } e = nft_elem_priv_cast(elem->priv); *elem_priv = &e->priv; pipapo_map(m, rulemap, e); return 0; } /** * pipapo_clone() - Clone matching data to create new working copy * @old: Existing matching data * * Return: copy of matching data passed as 'old' or NULL. */ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) { struct nft_pipapo_field *dst, *src; struct nft_pipapo_match *new; int i; new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); if (!new) return NULL; new->field_count = old->field_count; new->bsize_max = old->bsize_max; new->scratch = alloc_percpu(*new->scratch); if (!new->scratch) goto out_scratch; for_each_possible_cpu(i) *per_cpu_ptr(new->scratch, i) = NULL; if (pipapo_realloc_scratch(new, old->bsize_max)) goto out_scratch_realloc; rcu_head_init(&new->rcu); src = old->f; dst = new->f; for (i = 0; i < old->field_count; i++) { unsigned long *new_lt; memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * src->bsize * sizeof(*dst->lt) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; dst->lt = new_lt; memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), NFT_PIPAPO_LT_ALIGN(src->lt), src->bsize * sizeof(*dst->lt) * src->groups * NFT_PIPAPO_BUCKETS(src->bb)); if (src->rules > 0) { dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL_ACCOUNT); if (!dst->mt) goto out_mt; memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); } else { dst->mt = NULL; dst->rules_alloc = 0; } src++; dst++; } return new; out_mt: kvfree(dst->lt); out_lt: for (dst--; i > 0; i--) { kvfree(dst->mt); kvfree(dst->lt); dst--; } out_scratch_realloc: for_each_possible_cpu(i) pipapo_free_scratch(new, i); out_scratch: free_percpu(new->scratch); kfree(new); return NULL; } /** * pipapo_rules_same_key() - Get number of rules originated from the same entry * @f: Field containing mapping table * @first: Index of first rule in set of rules mapping to same entry * * Using the fact that all rules in a field that originated from the same entry * will map to the same set of rules in the next field, or to the same element * reference, return the cardinality of the set of rules that originated from * the same entry as the rule with index @first, @first rule included. * * In pictures: * rules * field #0 0 1 2 3 4 * map to: 0 1 2-4 2-4 5-9 * . . ....... . ... * | | | | \ \ * | | | | \ \ * | | | | \ \ * ' ' ' ' ' \ * in field #1 0 1 2 3 4 5 ... * * if this is called for rule 2 on field #0, it will return 3, as also rules 2 * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. * * For the last field in a set, we can rely on associated entries to map to the * same element references. * * Return: Number of rules that originated from the same entry as @first. */ static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first) { struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ unsigned int r; for (r = first; r < f->rules; r++) { if (r != first && e != f->mt[r].e) return r - first; e = f->mt[r].e; } if (r != first) return r - first; return 0; } /** * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones * @mt: Mapping array * @rules: Original amount of rules in mapping table * @start: First rule index to be removed * @n: Amount of rules to be removed * @to_offset: First rule index, in next field, this group of rules maps to * @is_last: If this is the last field, delete reference from mapping array * * This is used to unmap rules from the mapping table for a single field, * maintaining consistency and compactness for the existing ones. * * In pictures: let's assume that we want to delete rules 2 and 3 from the * following mapping array: * * rules * 0 1 2 3 4 * map to: 4-10 4-10 11-15 11-15 16-18 * * the result will be: * * rules * 0 1 2 * map to: 4-10 4-10 11-13 * * for fields before the last one. In case this is the mapping table for the * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: * * rules * 0 1 2 3 4 * element pointers: 0x42 0x42 0x33 0x33 0x44 * * the result will be: * * rules * 0 1 2 * element pointers: 0x42 0x42 0x44 */ static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules, unsigned int start, unsigned int n, unsigned int to_offset, bool is_last) { int i; memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); memset(mt + rules - n, 0, n * sizeof(*mt)); if (is_last) return; for (i = start; i < rules - n; i++) mt[i].to -= to_offset; } /** * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map * @m: Matching data * @rulemap: Table of rule maps, arrays of first rule and amount of rules * in next field a given entry maps to, for each field * * For each rule in lookup table buckets mapping to this set of rules, drop * all bits set in lookup table mapping. In pictures, assuming we want to drop * rules 0 and 1 from this lookup table: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * 1 1,2 0 * 2 0 1,2 * 3 0 1,2 * 4 0,1,2 * 5 0 1 2 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * * rule 2 becomes rule 0, and the result will be: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 * 1 0 * 2 0 * 3 0 * 4 0 * 5 0 * 6 0 * 7 0 0 * * once this is done, call unmap() to drop all the corresponding rule references * from mapping tables. */ static void pipapo_drop(struct nft_pipapo_match *m, union nft_pipapo_map_bucket rulemap[]) { struct nft_pipapo_field *f; int i; nft_pipapo_for_each_field(f, i, m) { int g; for (g = 0; g < f->groups; g++) { unsigned long *pos; int b; pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize; for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { bitmap_cut(pos, pos, rulemap[i].to, rulemap[i].n, f->bsize * BITS_PER_LONG); pos += f->bsize; } } pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, rulemap[i + 1].n, i == m->field_count - 1); if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { /* We can ignore this, a failure to shrink tables down * doesn't make tables invalid. */ ; } f->rules -= rulemap[i].n; pipapo_lt_bits_adjust(f); } } static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, struct nft_pipapo_elem *e) { nft_setelem_data_deactivate(net, set, &e->priv); } /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements * @set: nftables API set representation * @m: Matching data */ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); unsigned int rules_f0, first_rule = 0; u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_elem *e; struct nft_trans_gc *gc; gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const struct nft_pipapo_field *f; unsigned int i, start, rules_fx; start = first_rule; rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { rulemap[i].to = start; rulemap[i].n = rules_fx; if (i < m->field_count - 1) { rules_fx = f->mt[start].n; start = f->mt[start].to; } } /* Pick the last field, and its last index */ f--; i--; e = f->mt[rulemap[i].to].e; /* synchronous gc never fails, there is no need to set on * NFT_SET_ELEM_DEAD_BIT. */ if (__nft_set_elem_expired(&e->ext, tstamp)) { gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return; nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. */ } else { first_rule += rules_f0; } } gc = nft_trans_gc_catchall_sync(gc); if (gc) { nft_trans_gc_queue_sync_done(gc); priv->last_gc = jiffies; } } /** * pipapo_free_fields() - Free per-field tables contained in matching data * @m: Matching data */ static void pipapo_free_fields(struct nft_pipapo_match *m) { struct nft_pipapo_field *f; int i; nft_pipapo_for_each_field(f, i, m) { kvfree(f->lt); kvfree(f->mt); } } static void pipapo_free_match(struct nft_pipapo_match *m) { int i; for_each_possible_cpu(i) pipapo_free_scratch(m, i); free_percpu(m->scratch); pipapo_free_fields(m); kfree(m); } /** * pipapo_reclaim_match - RCU callback to free fields from old matching data * @rcu: RCU head */ static void pipapo_reclaim_match(struct rcu_head *rcu) { struct nft_pipapo_match *m; m = container_of(rcu, struct nft_pipapo_match, rcu); pipapo_free_match(m); } /** * nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working * copy before committing it for lookup, and don't replace the table if the * working copy doesn't have pending changes. * * We also need to create a new working copy for subsequent insertions and * deletions. */ static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *old; if (!priv->clone) return; if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) pipapo_gc(set, priv->clone); old = rcu_replace_pointer(priv->match, priv->clone, nft_pipapo_transaction_mutex_held(set)); priv->clone = NULL; if (old) call_rcu(&old->rcu, pipapo_reclaim_match); } static void nft_pipapo_abort(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); if (!priv->clone) return; pipapo_free_match(priv->clone); priv->clone = NULL; } /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. */ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); nft_clear(net, &e->ext); } /** * nft_pipapo_deactivate() - Search for element and make it inactive * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data * * Return: deactivated element if found, NULL otherwise. */ static struct nft_elem_priv * nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_pipapo_match *m = pipapo_maybe_clone(set); struct nft_pipapo_elem *e; /* removal must occur on priv->clone, if we are low on memory * we have no choice and must fail the removal request. */ if (!m) return NULL; e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL); if (IS_ERR(e)) return NULL; nft_set_elem_change_active(net, set, &e->ext); return &e->priv; } /** * nft_pipapo_flush() - make element inactive * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data * * This is functionally the same as nft_pipapo_deactivate(), with a slightly * different interface, and it's also called once for each element in a set * being flushed, so we can't implement, strictly speaking, a flush operation, * which would otherwise be as simple as allocating an empty copy of the * matching data. * * Note that we could in theory do that, mark the set as flushed, and ignore * subsequent calls, but we would leak all the elements after the first one, * because they wouldn't then be freed as result of API calls. * * Return: true if element was found and deactivated. */ static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &e->ext); } /** * pipapo_get_boundaries() - Get byte interval for associated rules * @f: Field including lookup table * @first_rule: First rule (lowest index) * @rule_count: Number of associated rules * @left: Byte expression for left boundary (start of range) * @right: Byte expression for right boundary (end of range) * * Given the first rule and amount of rules that originated from the same entry, * build the original range associated with the entry, and calculate the length * of the originating netmask. * * In pictures: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 1,2 * 1 1,2 * 2 1,2 * 3 1,2 * 4 1,2 * 5 1 2 * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * * this is the lookup table corresponding to the IPv4 range * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. * * This function fills @left and @right with the byte values of the leftmost * and rightmost bucket indices for the lowest and highest rule indices, * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in * nibbles: * left: < 12, 0, 10, 8, 0, 1, 0, 0 > * right: < 12, 0, 10, 8, 0, 2, 2, 1 > * corresponding to bytes: * left: < 192, 168, 1, 0 > * right: < 192, 168, 2, 1 > * with mask length irrelevant here, unused on return, as the range is already * defined by its start and end points. The mask length is relevant for a single * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes * < 192, 168, 1, 255 >, and the mask length, calculated from the distances * between leftmost and rightmost bucket indices for each group, would be 24. * * Return: mask length, in bits. */ static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, int rule_count, u8 *left, u8 *right) { int g, mask_len = 0, bit_offset = 0; u8 *l = left, *r = right; for (g = 0; g < f->groups; g++) { int b, x0, x1; x0 = -1; x1 = -1; for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { unsigned long *pos; pos = NFT_PIPAPO_LT_ALIGN(f->lt) + (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize; if (test_bit(first_rule, pos) && x0 == -1) x0 = b; if (test_bit(first_rule + rule_count - 1, pos)) x1 = b; } *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset); *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset); bit_offset += f->bb; if (bit_offset >= BITS_PER_BYTE) { bit_offset %= BITS_PER_BYTE; l++; r++; } if (x1 - x0 == 0) mask_len += 4; else if (x1 - x0 == 1) mask_len += 3; else if (x1 - x0 == 3) mask_len += 2; else if (x1 - x0 == 7) mask_len += 1; } return mask_len; } /** * pipapo_match_field() - Match rules against byte ranges * @f: Field including the lookup table * @first_rule: First of associated rules originating from same entry * @rule_count: Amount of associated rules * @start: Start of range to be matched * @end: End of range to be matched * * Return: true on match, false otherwise. */ static bool pipapo_match_field(struct nft_pipapo_field *f, int first_rule, int rule_count, const u8 *start, const u8 *end) { u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; pipapo_get_boundaries(f, first_rule, rule_count, left, right); return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) && !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); } /** * nft_pipapo_remove() - Remove element given key, commit * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data * * Similarly to nft_pipapo_activate(), this is used as commit operation by the * API, but it's called once per element in the pending transaction, so we can't * implement this as a single commit operation. Closest we can get is to remove * the matched element here, if any, and commit the updated matching data. */ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; unsigned int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; const u8 *data; e = nft_elem_priv_cast(elem_priv); data = (const u8 *)nft_set_ext_key(&e->ext); while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; struct nft_pipapo_field *f; int i, start, rules_fx; match_start = data; if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END)) match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; else match_end = data; start = first_rule; rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; rulemap[i].to = start; rulemap[i].n = rules_fx; rules_fx = f->mt[start].n; start = f->mt[start].to; match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); if (last && f->mt[rulemap[i].to].e == e) { pipapo_drop(m, rulemap); return; } } first_rule += rules_f0; } WARN_ON_ONCE(1); /* elem_priv not found */ } /** * nft_pipapo_do_walk() - Walk over elements in m * @ctx: nftables API context * @set: nftables API set representation * @m: matching data pointing to key mapping array * @iter: Iterator * * As elements are referenced in the mapping array for the last field, directly * scan that array: there's no need to follow rule mappings from the first * field. @m is protected either by RCU read lock or by transaction mutex. */ static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_match *m, struct nft_set_iter *iter) { const struct nft_pipapo_field *f; unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; for (r = 0; r < f->rules; r++) { struct nft_pipapo_elem *e; if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) continue; if (iter->count < iter->skip) goto cont; e = f->mt[r].e; iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) return; cont: iter->count++; } } /** * nft_pipapo_walk() - Walk over elements * @ctx: nftables API context * @set: nftables API set representation * @iter: Iterator * * Test if destructive action is needed or not, clone active backend if needed * and call the real function to work on the data. */ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); const struct nft_pipapo_match *m; switch (iter->type) { case NFT_ITER_UPDATE: m = pipapo_maybe_clone(set); if (!m) { iter->err = -ENOMEM; return; } nft_pipapo_do_walk(ctx, set, m, iter); break; case NFT_ITER_READ: rcu_read_lock(); m = rcu_dereference(priv->match); nft_pipapo_do_walk(ctx, set, m, iter); rcu_read_unlock(); break; default: iter->err = -EINVAL; WARN_ON_ONCE(1); break; } } /** * nft_pipapo_privsize() - Return the size of private data for the set * @nla: netlink attributes, ignored as size doesn't depend on them * @desc: Set description, ignored as size doesn't depend on it * * Return: size of private data for this set implementation, in bytes */ static u64 nft_pipapo_privsize(const struct nlattr * const nla[], const struct nft_set_desc *desc) { return sizeof(struct nft_pipapo); } /** * nft_pipapo_estimate() - Set size, space and lookup complexity * @desc: Set description, element count and field description used * @features: Flags: NFT_SET_INTERVAL needs to be there * @est: Storage for estimation data * * Return: true if set description is compatible, false otherwise */ static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { if (!(features & NFT_SET_INTERVAL) || desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; est->size = pipapo_estimate_size(desc); if (!est->size) return false; est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; return true; } /** * nft_pipapo_init() - Initialise data for a set instance * @set: nftables API set representation * @desc: Set description * @nla: netlink attributes * * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink * attributes, initialise internal set parameters, current instance of matching * data and a copy for subsequent insertions. * * Return: 0 on success, negative error code on failure. */ static int nft_pipapo_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; struct nft_pipapo_field *f; int err, i, field_count; BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); field_count = desc->field_count ? : 1; BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255); BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT); if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); if (!m) return -ENOMEM; m->field_count = field_count; m->bsize_max = 0; m->scratch = alloc_percpu(struct nft_pipapo_scratch *); if (!m->scratch) { err = -ENOMEM; goto out_scratch; } for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { unsigned int len = desc->field_len[i] ? : set->klen; /* f->groups is u8 */ BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256); f->bb = NFT_PIPAPO_GROUP_BITS_INIT; f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); priv->width += round_up(len, sizeof(u32)); f->bsize = 0; f->rules = 0; f->rules_alloc = 0; f->lt = NULL; f->mt = NULL; } rcu_assign_pointer(priv->match, m); return 0; out_scratch: kfree(m); return err; } /** * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array * @ctx: context * @set: nftables API set representation * @m: matching data pointing to key mapping array */ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; for (r = 0; r < f->rules; r++) { struct nft_pipapo_elem *e; if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) continue; e = f->mt[r].e; nf_tables_set_elem_destroy(ctx, set, &e->priv); } } /** * nft_pipapo_destroy() - Free private data for set and all committed elements * @ctx: context * @set: nftables API set representation */ static void nft_pipapo_destroy(const struct nft_ctx *ctx, const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; m = rcu_dereference_protected(priv->match, true); if (priv->clone) { nft_set_pipapo_match_destroy(ctx, set, priv->clone); pipapo_free_match(priv->clone); priv->clone = NULL; } else { nft_set_pipapo_match_destroy(ctx, set, m); } pipapo_free_match(m); } /** * nft_pipapo_gc_init() - Initialise garbage collection * @set: nftables API set representation * * Instead of actually setting up a periodic work for garbage collection, as * this operation requires a swap of matching data with the working copy, we'll * do that opportunistically with other commit operations if the interval is * elapsed, so we just need to set the current jiffies timestamp here. */ static void nft_pipapo_gc_init(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); priv->last_gc = jiffies; } const struct nft_set_type nft_set_pipapo_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .lookup = nft_pipapo_lookup, .insert = nft_pipapo_insert, .activate = nft_pipapo_activate, .deactivate = nft_pipapo_deactivate, .flush = nft_pipapo_flush, .remove = nft_pipapo_remove, .walk = nft_pipapo_walk, .get = nft_pipapo_get, .privsize = nft_pipapo_privsize, .estimate = nft_pipapo_estimate, .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) const struct nft_set_type nft_set_pipapo_avx2_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .lookup = nft_pipapo_avx2_lookup, .insert = nft_pipapo_insert, .activate = nft_pipapo_activate, .deactivate = nft_pipapo_deactivate, .flush = nft_pipapo_flush, .remove = nft_pipapo_remove, .walk = nft_pipapo_walk, .get = nft_pipapo_get, .privsize = nft_pipapo_privsize, .estimate = nft_pipapo_avx2_estimate, .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; #endif |
| 13646 14121 14128 42778 42762 14117 14108 11105 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | // SPDX-License-Identifier: GPL-2.0-or-later /* * printk_safe.c - Safe printk for printk-deadlock-prone contexts */ #include <linux/preempt.h> #include <linux/kdb.h> #include <linux/smp.h> #include <linux/cpumask.h> #include <linux/printk.h> #include <linux/kprobes.h> #include "internal.h" /* Context where printk messages are never suppressed */ static atomic_t force_con; void printk_force_console_enter(void) { atomic_inc(&force_con); } void printk_force_console_exit(void) { atomic_dec(&force_con); } bool is_printk_force_console(void) { return atomic_read(&force_con); } static DEFINE_PER_CPU(int, printk_context); /* Can be preempted by NMI. */ void __printk_safe_enter(void) { this_cpu_inc(printk_context); } /* Can be preempted by NMI. */ void __printk_safe_exit(void) { this_cpu_dec(printk_context); } void __printk_deferred_enter(void) { cant_migrate(); __printk_safe_enter(); } void __printk_deferred_exit(void) { cant_migrate(); __printk_safe_exit(); } bool is_printk_legacy_deferred(void) { /* * The per-CPU variable @printk_context can be read safely in any * context. CPU migration is always disabled when set. * * A context holding the printk_cpu_sync must not spin waiting for * another CPU. For legacy printing, it could be the console_lock * or the port lock. */ return (force_legacy_kthread() || this_cpu_read(printk_context) || in_nmi() || is_printk_cpu_sync_owner()); } asmlinkage int vprintk(const char *fmt, va_list args) { #ifdef CONFIG_KGDB_KDB /* Allow to pass printk() to kdb but avoid a recursion. */ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); #endif return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); |
| 13 13 13 13 13 13 13 13 9 9 8 1 8 9 56 56 55 2 56 55 1 52 410 394 14 14 13 14 5 4 1 1 3 14 49 49 49 49 49 35 35 49 49 14 14 14 14 1 14 109 60 49 35 14 13 49 36 13 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 | // SPDX-License-Identifier: GPL-2.0-only /* * Yama Linux Security Module * * Author: Kees Cook <keescook@chromium.org> * * Copyright (C) 2010 Canonical, Ltd. * Copyright (C) 2011 The Chromium OS Authors. */ #include <linux/lsm_hooks.h> #include <linux/sysctl.h> #include <linux/ptrace.h> #include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/string_helpers.h> #include <linux/task_work.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <uapi/linux/lsm.h> #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 #define YAMA_SCOPE_CAPABILITY 2 #define YAMA_SCOPE_NO_ATTACH 3 static int ptrace_scope = YAMA_SCOPE_RELATIONAL; /* describe a ptrace relationship for potential exception */ struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; bool invalid; struct list_head node; struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); static void yama_relation_cleanup(struct work_struct *work); static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); struct access_report_info { struct callback_head work; const char *access; struct task_struct *target; struct task_struct *agent; }; static void __report_access(struct callback_head *work) { struct access_report_info *info = container_of(work, struct access_report_info, work); char *target_cmd, *agent_cmd; target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL); agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL); pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", info->access, target_cmd, info->target->pid, agent_cmd, info->agent->pid); kfree(agent_cmd); kfree(target_cmd); put_task_struct(info->agent); put_task_struct(info->target); kfree(info); } /* defers execution because cmdline access can sleep */ static void report_access(const char *access, struct task_struct *target, struct task_struct *agent) { struct access_report_info *info; assert_spin_locked(&target->alloc_lock); /* for target->comm */ if (current->flags & PF_KTHREAD) { /* I don't think kthreads call task_work_run() before exiting. * Imagine angry ranting about procfs here. */ pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", access, target->comm, target->pid, agent->comm, agent->pid); return; } info = kmalloc(sizeof(*info), GFP_ATOMIC); if (!info) return; init_task_work(&info->work, __report_access); get_task_struct(target); get_task_struct(agent); info->access = access; info->target = target; info->agent = agent; if (task_work_add(current, &info->work, TWA_RESUME) == 0) return; /* success */ WARN(1, "report_access called from exiting task"); put_task_struct(target); put_task_struct(agent); kfree(info); } /** * yama_relation_cleanup - remove invalid entries from the relation list * @work: unused * */ static void yama_relation_cleanup(struct work_struct *work) { struct ptrace_relation *relation; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) { list_del_rcu(&relation->node); kfree_rcu(relation, rcu); } } rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); } /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace * @tracee: the task_struct of the process to be ptraced * * Each tracee can have, at most, one tracer registered. Each time this * is called, the prior registered tracer will be replaced for the tracee. * * Returns 0 if relationship was added, -ve on error. */ static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation, *added; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; added->tracee = tracee; added->tracer = tracer; added->invalid = false; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { list_replace_rcu(&relation->node, &added->node); kfree_rcu(relation, rcu); goto out; } } list_add_rcu(&added->node, &ptracer_relations); out: rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); return 0; } /** * yama_ptracer_del - remove exceptions related to the given tasks * @tracer: remove any relation where tracer task matches * @tracee: remove any relation where tracee task matches */ static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation; bool marked = false; rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { relation->invalid = true; marked = true; } } rcu_read_unlock(); if (marked) schedule_work(&yama_relation_work); } /** * yama_task_free - check for task_pid to remove from exception list * @task: task being removed */ static void yama_task_free(struct task_struct *task) { yama_ptracer_del(task, task); } /** * yama_task_prctl - check for Yama-specific prctl operations * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Return 0 on success, -ve on error. -ENOSYS is returned when Yama * does not handle the given option. */ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc = -ENOSYS; struct task_struct *myself = current; switch (option) { case PR_SET_PTRACER: /* Since a thread can call prctl(), find the group leader * before calling _add() or _del() on it, since we want * process-level granularity of control. The tracer group * leader checking is handled later when walking the ancestry * at the time of PTRACE_ATTACH check. */ rcu_read_lock(); if (!thread_group_leader(myself)) myself = rcu_dereference(myself->group_leader); get_task_struct(myself); rcu_read_unlock(); if (arg2 == 0) { yama_ptracer_del(NULL, myself); rc = 0; } else if (arg2 == PR_SET_PTRACER_ANY || (int)arg2 == -1) { rc = yama_ptracer_add(NULL, myself); } else { struct task_struct *tracer; tracer = find_get_task_by_vpid(arg2); if (!tracer) { rc = -EINVAL; } else { rc = yama_ptracer_add(tracer, myself); put_task_struct(tracer); } } put_task_struct(myself); break; } return rc; } /** * task_is_descendant - walk up a process family tree looking for a match * @parent: the process to compare against while walking up from child * @child: the process to start from while looking upwards for parent * * Returns 1 if child is a descendant of parent, 0 if not. */ static int task_is_descendant(struct task_struct *parent, struct task_struct *child) { int rc = 0; struct task_struct *walker = child; if (!parent || !child) return 0; rcu_read_lock(); if (!thread_group_leader(parent)) parent = rcu_dereference(parent->group_leader); while (walker->pid > 0) { if (!thread_group_leader(walker)) walker = rcu_dereference(walker->group_leader); if (walker == parent) { rc = 1; break; } walker = rcu_dereference(walker->real_parent); } rcu_read_unlock(); return rc; } /** * ptracer_exception_found - tracer registered as exception for this tracee * @tracer: the task_struct of the process attempting ptrace * @tracee: the task_struct of the process to be ptraced * * Returns 1 if tracer has a ptracer exception ancestor for tracee. */ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *tracee) { int rc = 0; struct ptrace_relation *relation; struct task_struct *parent = NULL; bool found = false; rcu_read_lock(); /* * If there's already an active tracing relationship, then make an * exception for the sake of other accesses, like process_vm_rw(). */ parent = ptrace_parent(tracee); if (parent != NULL && same_thread_group(parent, tracer)) { rc = 1; goto unlock; } /* Look for a PR_SET_PTRACER relationship. */ if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; unlock: rcu_read_unlock(); return rc; } /** * yama_ptrace_access_check - validate PTRACE_ATTACH calls * @child: task that current task is attempting to ptrace * @mode: ptrace attach mode * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc = 0; /* require ptrace target be a child of ptracer on attach */ if (mode & PTRACE_MODE_ATTACH) { switch (ptrace_scope) { case YAMA_SCOPE_DISABLED: /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: rcu_read_lock(); if (!pid_alive(child)) rc = -EPERM; if (!rc && !task_is_descendant(current, child) && !ptracer_exception_found(current, child) && !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: rcu_read_lock(); if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: rc = -EPERM; break; } } if (rc && (mode & PTRACE_MODE_NOAUDIT) == 0) report_access("attach", child, current); return rc; } /** * yama_ptrace_traceme - validate PTRACE_TRACEME calls * @parent: task that will become the ptracer of the current task * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_traceme(struct task_struct *parent) { int rc = 0; /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) rc = -EPERM; break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; break; } if (rc) { task_lock(current); report_access("traceme", current, parent); task_unlock(current); } return rc; } static const struct lsm_id yama_lsmid = { .name = "yama", .id = LSM_ID_YAMA, }; static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), LSM_HOOK_INIT(task_prctl, yama_task_prctl), LSM_HOOK_INIT(task_free, yama_task_free), }; #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; if (write && !capable(CAP_SYS_PTRACE)) return -EPERM; /* Lock the max value if it ever gets set. */ table_copy = *table; if (*(int *)table_copy.data == *(int *)table_copy.extra2) table_copy.extra1 = table_copy.extra2; return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } static int max_scope = YAMA_SCOPE_NO_ATTACH; static const struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, }; static void __init yama_init_sysctl(void) { if (!register_sysctl("kernel/yama", yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); } #else static inline void yama_init_sysctl(void) { } #endif /* CONFIG_SYSCTL */ static int __init yama_init(void) { pr_info("Yama: becoming mindful.\n"); security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), &yama_lsmid); yama_init_sysctl(); return 0; } DEFINE_LSM(yama) = { .name = "yama", .init = yama_init, }; |
| 5 7 2 1 4 9 9 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | // SPDX-License-Identifier: GPL-2.0-or-later /* * authencesn.c - AEAD wrapper for IPsec with extended sequence numbers, * derived from authenc.c * * Copyright (C) 2010 secunet Security Networks AG * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/authenc.h> #include <crypto/null.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> struct authenc_esn_instance_ctx { struct crypto_ahash_spawn auth; struct crypto_skcipher_spawn enc; }; struct crypto_authenc_esn_ctx { unsigned int reqoff; struct crypto_ahash *auth; struct crypto_skcipher *enc; struct crypto_sync_skcipher *null; }; struct authenc_esn_request_ctx { struct scatterlist src[2]; struct scatterlist dst[2]; char tail[]; }; static void authenc_esn_request_complete(struct aead_request *req, int err) { if (err != -EINPROGRESS) aead_request_complete(req, err); } static int crypto_authenc_esn_setauthsize(struct crypto_aead *authenc_esn, unsigned int authsize) { if (authsize > 0 && authsize < 4) return -EINVAL; return 0; } static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 *key, unsigned int keylen) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; struct crypto_skcipher *enc = ctx->enc; struct crypto_authenc_keys keys; int err = -EINVAL; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) goto out; crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen); if (err) goto out; crypto_skcipher_clear_flags(enc, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen); out: memzero_explicit(&keys, sizeof(keys)); return err; } static int crypto_authenc_esn_genicv_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); u8 *hash = areq_ctx->tail; unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); scatterwalk_map_and_copy(hash, dst, assoclen + cryptlen, authsize, 1); return 0; } static void authenc_esn_geniv_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_genicv_tail(req, 0); aead_request_complete(req, err); } static int crypto_authenc_esn_genicv(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *hash = areq_ctx->tail; struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; if (!authsize) return 0; /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, dst, 0, 8, 0); scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, hash, assoclen + cryptlen); ahash_request_set_callback(ahreq, flags, authenc_esn_geniv_ahash_done, req); return crypto_ahash_digest(ahreq) ?: crypto_authenc_esn_genicv_tail(req, aead_request_flags(req)); } static void crypto_authenc_esn_encrypt_done(void *data, int err) { struct aead_request *areq = data; if (!err) err = crypto_authenc_esn_genicv(areq, 0); authenc_esn_request_complete(areq, err); } static int crypto_authenc_esn_copy(struct aead_request *req, unsigned int len) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); SYNC_SKCIPHER_REQUEST_ON_STACK(skreq, ctx->null); skcipher_request_set_sync_tfm(skreq, ctx->null); skcipher_request_set_callback(skreq, aead_request_flags(req), NULL, NULL); skcipher_request_set_crypt(skreq, req->src, req->dst, len, NULL); return crypto_skcipher_encrypt(skreq); } static int crypto_authenc_esn_encrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_skcipher *enc = ctx->enc; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *src, *dst; int err; sg_init_table(areq_ctx->src, 2); src = scatterwalk_ffwd(areq_ctx->src, req->src, assoclen); dst = src; if (req->src != req->dst) { err = crypto_authenc_esn_copy(req, assoclen); if (err) return err; sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, assoclen); } skcipher_request_set_tfm(skreq, enc); skcipher_request_set_callback(skreq, aead_request_flags(req), crypto_authenc_esn_encrypt_done, req); skcipher_request_set_crypt(skreq, src, dst, cryptlen, req->iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; return crypto_authenc_esn_genicv(req, aead_request_flags(req)); } static int crypto_authenc_esn_decrypt_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int cryptlen = req->cryptlen - authsize; unsigned int assoclen = req->assoclen; struct scatterlist *dst = req->dst; u8 *ihash = ohash + crypto_ahash_digestsize(auth); u32 tmp[2]; if (!authsize) goto decrypt; /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); if (crypto_memneq(ihash, ohash, authsize)) return -EBADMSG; decrypt: sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, assoclen); skcipher_request_set_tfm(skreq, ctx->enc); skcipher_request_set_callback(skreq, flags, req->base.complete, req->base.data); skcipher_request_set_crypt(skreq, dst, dst, cryptlen, req->iv); return crypto_skcipher_decrypt(skreq); } static void authenc_esn_verify_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_decrypt_tail(req, 0); authenc_esn_request_complete(req, err); } static int crypto_authenc_esn_decrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; u8 *ihash = ohash + crypto_ahash_digestsize(auth); struct scatterlist *dst = req->dst; u32 tmp[2]; int err; cryptlen -= authsize; if (req->src != dst) { err = crypto_authenc_esn_copy(req, assoclen + cryptlen); if (err) return err; } scatterwalk_map_and_copy(ihash, req->src, assoclen + cryptlen, authsize, 0); if (!authsize) goto tail; /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, dst, 0, 8, 0); scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, ohash, assoclen + cryptlen); ahash_request_set_callback(ahreq, aead_request_flags(req), authenc_esn_verify_ahash_done, req); err = crypto_ahash_digest(ahreq); if (err) return err; tail: return crypto_authenc_esn_decrypt_tail(req, aead_request_flags(req)); } static int crypto_authenc_esn_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct authenc_esn_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *auth; struct crypto_skcipher *enc; struct crypto_sync_skcipher *null; int err; auth = crypto_spawn_ahash(&ictx->auth); if (IS_ERR(auth)) return PTR_ERR(auth); enc = crypto_spawn_skcipher(&ictx->enc); err = PTR_ERR(enc); if (IS_ERR(enc)) goto err_free_ahash; null = crypto_get_default_null_skcipher(); err = PTR_ERR(null); if (IS_ERR(null)) goto err_free_skcipher; ctx->auth = auth; ctx->enc = enc; ctx->null = null; ctx->reqoff = 2 * crypto_ahash_digestsize(auth); crypto_aead_set_reqsize( tfm, sizeof(struct authenc_esn_request_ctx) + ctx->reqoff + max_t(unsigned int, crypto_ahash_reqsize(auth) + sizeof(struct ahash_request), sizeof(struct skcipher_request) + crypto_skcipher_reqsize(enc))); return 0; err_free_skcipher: crypto_free_skcipher(enc); err_free_ahash: crypto_free_ahash(auth); return err; } static void crypto_authenc_esn_exit_tfm(struct crypto_aead *tfm) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->auth); crypto_free_skcipher(ctx->enc); crypto_put_default_null_skcipher(); } static void crypto_authenc_esn_free(struct aead_instance *inst) { struct authenc_esn_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_skcipher(&ctx->enc); crypto_drop_ahash(&ctx->auth); kfree(inst); } static int crypto_authenc_esn_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct authenc_esn_instance_ctx *ctx; struct skcipher_alg_common *enc; struct hash_alg_common *auth; struct crypto_alg *auth_base; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ctx->auth, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; auth = crypto_spawn_ahash_alg(&ctx->auth); auth_base = &auth->base; err = crypto_grab_skcipher(&ctx->enc, aead_crypto_instance(inst), crypto_attr_alg_name(tb[2]), 0, mask); if (err) goto err_free_inst; enc = crypto_spawn_skcipher_alg_common(&ctx->enc); err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_name, enc->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_driver_name, enc->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = enc->base.cra_priority * 10 + auth_base->cra_priority; inst->alg.base.cra_blocksize = enc->base.cra_blocksize; inst->alg.base.cra_alignmask = enc->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct crypto_authenc_esn_ctx); inst->alg.ivsize = enc->ivsize; inst->alg.chunksize = enc->chunksize; inst->alg.maxauthsize = auth->digestsize; inst->alg.init = crypto_authenc_esn_init_tfm; inst->alg.exit = crypto_authenc_esn_exit_tfm; inst->alg.setkey = crypto_authenc_esn_setkey; inst->alg.setauthsize = crypto_authenc_esn_setauthsize; inst->alg.encrypt = crypto_authenc_esn_encrypt; inst->alg.decrypt = crypto_authenc_esn_decrypt; inst->free = crypto_authenc_esn_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_authenc_esn_free(inst); } return err; } static struct crypto_template crypto_authenc_esn_tmpl = { .name = "authencesn", .create = crypto_authenc_esn_create, .module = THIS_MODULE, }; static int __init crypto_authenc_esn_module_init(void) { return crypto_register_template(&crypto_authenc_esn_tmpl); } static void __exit crypto_authenc_esn_module_exit(void) { crypto_unregister_template(&crypto_authenc_esn_tmpl); } subsys_initcall(crypto_authenc_esn_module_init); module_exit(crypto_authenc_esn_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("AEAD wrapper for IPsec with extended sequence numbers"); MODULE_ALIAS_CRYPTO("authencesn"); |
| 5 5 5 1 1 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * DES & Triple DES EDE Cipher Algorithms. * * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no> */ #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/types.h> #include <linux/unaligned.h> #include <crypto/des.h> #include <crypto/internal/des.h> #define ROL(x, r) ((x) = rol32((x), (r))) #define ROR(x, r) ((x) = ror32((x), (r))) /* Lookup tables for key expansion */ static const u8 pc1[256] = { 0x00, 0x00, 0x40, 0x04, 0x10, 0x10, 0x50, 0x14, 0x04, 0x40, 0x44, 0x44, 0x14, 0x50, 0x54, 0x54, 0x02, 0x02, 0x42, 0x06, 0x12, 0x12, 0x52, 0x16, 0x06, 0x42, 0x46, 0x46, 0x16, 0x52, 0x56, 0x56, 0x80, 0x08, 0xc0, 0x0c, 0x90, 0x18, 0xd0, 0x1c, 0x84, 0x48, 0xc4, 0x4c, 0x94, 0x58, 0xd4, 0x5c, 0x82, 0x0a, 0xc2, 0x0e, 0x92, 0x1a, 0xd2, 0x1e, 0x86, 0x4a, 0xc6, 0x4e, 0x96, 0x5a, 0xd6, 0x5e, 0x20, 0x20, 0x60, 0x24, 0x30, 0x30, 0x70, 0x34, 0x24, 0x60, 0x64, 0x64, 0x34, 0x70, 0x74, 0x74, 0x22, 0x22, 0x62, 0x26, 0x32, 0x32, 0x72, 0x36, 0x26, 0x62, 0x66, 0x66, 0x36, 0x72, 0x76, 0x76, 0xa0, 0x28, 0xe0, 0x2c, 0xb0, 0x38, 0xf0, 0x3c, 0xa4, 0x68, 0xe4, 0x6c, 0xb4, 0x78, 0xf4, 0x7c, 0xa2, 0x2a, 0xe2, 0x2e, 0xb2, 0x3a, 0xf2, 0x3e, 0xa6, 0x6a, 0xe6, 0x6e, 0xb6, 0x7a, 0xf6, 0x7e, 0x08, 0x80, 0x48, 0x84, 0x18, 0x90, 0x58, 0x94, 0x0c, 0xc0, 0x4c, 0xc4, 0x1c, 0xd0, 0x5c, 0xd4, 0x0a, 0x82, 0x4a, 0x86, 0x1a, 0x92, 0x5a, 0x96, 0x0e, 0xc2, 0x4e, 0xc6, 0x1e, 0xd2, 0x5e, 0xd6, 0x88, 0x88, 0xc8, 0x8c, 0x98, 0x98, 0xd8, 0x9c, 0x8c, 0xc8, 0xcc, 0xcc, 0x9c, 0xd8, 0xdc, 0xdc, 0x8a, 0x8a, 0xca, 0x8e, 0x9a, 0x9a, 0xda, 0x9e, 0x8e, 0xca, 0xce, 0xce, 0x9e, 0xda, 0xde, 0xde, 0x28, 0xa0, 0x68, 0xa4, 0x38, 0xb0, 0x78, 0xb4, 0x2c, 0xe0, 0x6c, 0xe4, 0x3c, 0xf0, 0x7c, 0xf4, 0x2a, 0xa2, 0x6a, 0xa6, 0x3a, 0xb2, 0x7a, 0xb6, 0x2e, 0xe2, 0x6e, 0xe6, 0x3e, 0xf2, 0x7e, 0xf6, 0xa8, 0xa8, 0xe8, 0xac, 0xb8, 0xb8, 0xf8, 0xbc, 0xac, 0xe8, 0xec, 0xec, 0xbc, 0xf8, 0xfc, 0xfc, 0xaa, 0xaa, 0xea, 0xae, 0xba, 0xba, 0xfa, 0xbe, 0xae, 0xea, 0xee, 0xee, 0xbe, 0xfa, 0xfe, 0xfe }; static const u8 rs[256] = { 0x00, 0x00, 0x80, 0x80, 0x02, 0x02, 0x82, 0x82, 0x04, 0x04, 0x84, 0x84, 0x06, 0x06, 0x86, 0x86, 0x08, 0x08, 0x88, 0x88, 0x0a, 0x0a, 0x8a, 0x8a, 0x0c, 0x0c, 0x8c, 0x8c, 0x0e, 0x0e, 0x8e, 0x8e, 0x10, 0x10, 0x90, 0x90, 0x12, 0x12, 0x92, 0x92, 0x14, 0x14, 0x94, 0x94, 0x16, 0x16, 0x96, 0x96, 0x18, 0x18, 0x98, 0x98, 0x1a, 0x1a, 0x9a, 0x9a, 0x1c, 0x1c, 0x9c, 0x9c, 0x1e, 0x1e, 0x9e, 0x9e, 0x20, 0x20, 0xa0, 0xa0, 0x22, 0x22, 0xa2, 0xa2, 0x24, 0x24, 0xa4, 0xa4, 0x26, 0x26, 0xa6, 0xa6, 0x28, 0x28, 0xa8, 0xa8, 0x2a, 0x2a, 0xaa, 0xaa, 0x2c, 0x2c, 0xac, 0xac, 0x2e, 0x2e, 0xae, 0xae, 0x30, 0x30, 0xb0, 0xb0, 0x32, 0x32, 0xb2, 0xb2, 0x34, 0x34, 0xb4, 0xb4, 0x36, 0x36, 0xb6, 0xb6, 0x38, 0x38, 0xb8, 0xb8, 0x3a, 0x3a, 0xba, 0xba, 0x3c, 0x3c, 0xbc, 0xbc, 0x3e, 0x3e, 0xbe, 0xbe, 0x40, 0x40, 0xc0, 0xc0, 0x42, 0x42, 0xc2, 0xc2, 0x44, 0x44, 0xc4, 0xc4, 0x46, 0x46, 0xc6, 0xc6, 0x48, 0x48, 0xc8, 0xc8, 0x4a, 0x4a, 0xca, 0xca, 0x4c, 0x4c, 0xcc, 0xcc, 0x4e, 0x4e, 0xce, 0xce, 0x50, 0x50, 0xd0, 0xd0, 0x52, 0x52, 0xd2, 0xd2, 0x54, 0x54, 0xd4, 0xd4, 0x56, 0x56, 0xd6, 0xd6, 0x58, 0x58, 0xd8, 0xd8, 0x5a, 0x5a, 0xda, 0xda, 0x5c, 0x5c, 0xdc, 0xdc, 0x5e, 0x5e, 0xde, 0xde, 0x60, 0x60, 0xe0, 0xe0, 0x62, 0x62, 0xe2, 0xe2, 0x64, 0x64, 0xe4, 0xe4, 0x66, 0x66, 0xe6, 0xe6, 0x68, 0x68, 0xe8, 0xe8, 0x6a, 0x6a, 0xea, 0xea, 0x6c, 0x6c, 0xec, 0xec, 0x6e, 0x6e, 0xee, 0xee, 0x70, 0x70, 0xf0, 0xf0, 0x72, 0x72, 0xf2, 0xf2, 0x74, 0x74, 0xf4, 0xf4, 0x76, 0x76, 0xf6, 0xf6, 0x78, 0x78, 0xf8, 0xf8, 0x7a, 0x7a, 0xfa, 0xfa, 0x7c, 0x7c, 0xfc, 0xfc, 0x7e, 0x7e, 0xfe, 0xfe }; static const u32 pc2[1024] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00040000, 0x00000000, 0x04000000, 0x00100000, 0x00400000, 0x00000008, 0x00000800, 0x40000000, 0x00440000, 0x00000008, 0x04000800, 0x40100000, 0x00000400, 0x00000020, 0x08000000, 0x00000100, 0x00040400, 0x00000020, 0x0c000000, 0x00100100, 0x00400400, 0x00000028, 0x08000800, 0x40000100, 0x00440400, 0x00000028, 0x0c000800, 0x40100100, 0x80000000, 0x00000010, 0x00000000, 0x00800000, 0x80040000, 0x00000010, 0x04000000, 0x00900000, 0x80400000, 0x00000018, 0x00000800, 0x40800000, 0x80440000, 0x00000018, 0x04000800, 0x40900000, 0x80000400, 0x00000030, 0x08000000, 0x00800100, 0x80040400, 0x00000030, 0x0c000000, 0x00900100, 0x80400400, 0x00000038, 0x08000800, 0x40800100, 0x80440400, 0x00000038, 0x0c000800, 0x40900100, 0x10000000, 0x00000000, 0x00200000, 0x00001000, 0x10040000, 0x00000000, 0x04200000, 0x00101000, 0x10400000, 0x00000008, 0x00200800, 0x40001000, 0x10440000, 0x00000008, 0x04200800, 0x40101000, 0x10000400, 0x00000020, 0x08200000, 0x00001100, 0x10040400, 0x00000020, 0x0c200000, 0x00101100, 0x10400400, 0x00000028, 0x08200800, 0x40001100, 0x10440400, 0x00000028, 0x0c200800, 0x40101100, 0x90000000, 0x00000010, 0x00200000, 0x00801000, 0x90040000, 0x00000010, 0x04200000, 0x00901000, 0x90400000, 0x00000018, 0x00200800, 0x40801000, 0x90440000, 0x00000018, 0x04200800, 0x40901000, 0x90000400, 0x00000030, 0x08200000, 0x00801100, 0x90040400, 0x00000030, 0x0c200000, 0x00901100, 0x90400400, 0x00000038, 0x08200800, 0x40801100, 0x90440400, 0x00000038, 0x0c200800, 0x40901100, 0x00000200, 0x00080000, 0x00000000, 0x00000004, 0x00040200, 0x00080000, 0x04000000, 0x00100004, 0x00400200, 0x00080008, 0x00000800, 0x40000004, 0x00440200, 0x00080008, 0x04000800, 0x40100004, 0x00000600, 0x00080020, 0x08000000, 0x00000104, 0x00040600, 0x00080020, 0x0c000000, 0x00100104, 0x00400600, 0x00080028, 0x08000800, 0x40000104, 0x00440600, 0x00080028, 0x0c000800, 0x40100104, 0x80000200, 0x00080010, 0x00000000, 0x00800004, 0x80040200, 0x00080010, 0x04000000, 0x00900004, 0x80400200, 0x00080018, 0x00000800, 0x40800004, 0x80440200, 0x00080018, 0x04000800, 0x40900004, 0x80000600, 0x00080030, 0x08000000, 0x00800104, 0x80040600, 0x00080030, 0x0c000000, 0x00900104, 0x80400600, 0x00080038, 0x08000800, 0x40800104, 0x80440600, 0x00080038, 0x0c000800, 0x40900104, 0x10000200, 0x00080000, 0x00200000, 0x00001004, 0x10040200, 0x00080000, 0x04200000, 0x00101004, 0x10400200, 0x00080008, 0x00200800, 0x40001004, 0x10440200, 0x00080008, 0x04200800, 0x40101004, 0x10000600, 0x00080020, 0x08200000, 0x00001104, 0x10040600, 0x00080020, 0x0c200000, 0x00101104, 0x10400600, 0x00080028, 0x08200800, 0x40001104, 0x10440600, 0x00080028, 0x0c200800, 0x40101104, 0x90000200, 0x00080010, 0x00200000, 0x00801004, 0x90040200, 0x00080010, 0x04200000, 0x00901004, 0x90400200, 0x00080018, 0x00200800, 0x40801004, 0x90440200, 0x00080018, 0x04200800, 0x40901004, 0x90000600, 0x00080030, 0x08200000, 0x00801104, 0x90040600, 0x00080030, 0x0c200000, 0x00901104, 0x90400600, 0x00080038, 0x08200800, 0x40801104, 0x90440600, 0x00080038, 0x0c200800, 0x40901104, 0x00000002, 0x00002000, 0x20000000, 0x00000001, 0x00040002, 0x00002000, 0x24000000, 0x00100001, 0x00400002, 0x00002008, 0x20000800, 0x40000001, 0x00440002, 0x00002008, 0x24000800, 0x40100001, 0x00000402, 0x00002020, 0x28000000, 0x00000101, 0x00040402, 0x00002020, 0x2c000000, 0x00100101, 0x00400402, 0x00002028, 0x28000800, 0x40000101, 0x00440402, 0x00002028, 0x2c000800, 0x40100101, 0x80000002, 0x00002010, 0x20000000, 0x00800001, 0x80040002, 0x00002010, 0x24000000, 0x00900001, 0x80400002, 0x00002018, 0x20000800, 0x40800001, 0x80440002, 0x00002018, 0x24000800, 0x40900001, 0x80000402, 0x00002030, 0x28000000, 0x00800101, 0x80040402, 0x00002030, 0x2c000000, 0x00900101, 0x80400402, 0x00002038, 0x28000800, 0x40800101, 0x80440402, 0x00002038, 0x2c000800, 0x40900101, 0x10000002, 0x00002000, 0x20200000, 0x00001001, 0x10040002, 0x00002000, 0x24200000, 0x00101001, 0x10400002, 0x00002008, 0x20200800, 0x40001001, 0x10440002, 0x00002008, 0x24200800, 0x40101001, 0x10000402, 0x00002020, 0x28200000, 0x00001101, 0x10040402, 0x00002020, 0x2c200000, 0x00101101, 0x10400402, 0x00002028, 0x28200800, 0x40001101, 0x10440402, 0x00002028, 0x2c200800, 0x40101101, 0x90000002, 0x00002010, 0x20200000, 0x00801001, 0x90040002, 0x00002010, 0x24200000, 0x00901001, 0x90400002, 0x00002018, 0x20200800, 0x40801001, 0x90440002, 0x00002018, 0x24200800, 0x40901001, 0x90000402, 0x00002030, 0x28200000, 0x00801101, 0x90040402, 0x00002030, 0x2c200000, 0x00901101, 0x90400402, 0x00002038, 0x28200800, 0x40801101, 0x90440402, 0x00002038, 0x2c200800, 0x40901101, 0x00000202, 0x00082000, 0x20000000, 0x00000005, 0x00040202, 0x00082000, 0x24000000, 0x00100005, 0x00400202, 0x00082008, 0x20000800, 0x40000005, 0x00440202, 0x00082008, 0x24000800, 0x40100005, 0x00000602, 0x00082020, 0x28000000, 0x00000105, 0x00040602, 0x00082020, 0x2c000000, 0x00100105, 0x00400602, 0x00082028, 0x28000800, 0x40000105, 0x00440602, 0x00082028, 0x2c000800, 0x40100105, 0x80000202, 0x00082010, 0x20000000, 0x00800005, 0x80040202, 0x00082010, 0x24000000, 0x00900005, 0x80400202, 0x00082018, 0x20000800, 0x40800005, 0x80440202, 0x00082018, 0x24000800, 0x40900005, 0x80000602, 0x00082030, 0x28000000, 0x00800105, 0x80040602, 0x00082030, 0x2c000000, 0x00900105, 0x80400602, 0x00082038, 0x28000800, 0x40800105, 0x80440602, 0x00082038, 0x2c000800, 0x40900105, 0x10000202, 0x00082000, 0x20200000, 0x00001005, 0x10040202, 0x00082000, 0x24200000, 0x00101005, 0x10400202, 0x00082008, 0x20200800, 0x40001005, 0x10440202, 0x00082008, 0x24200800, 0x40101005, 0x10000602, 0x00082020, 0x28200000, 0x00001105, 0x10040602, 0x00082020, 0x2c200000, 0x00101105, 0x10400602, 0x00082028, 0x28200800, 0x40001105, 0x10440602, 0x00082028, 0x2c200800, 0x40101105, 0x90000202, 0x00082010, 0x20200000, 0x00801005, 0x90040202, 0x00082010, 0x24200000, 0x00901005, 0x90400202, 0x00082018, 0x20200800, 0x40801005, 0x90440202, 0x00082018, 0x24200800, 0x40901005, 0x90000602, 0x00082030, 0x28200000, 0x00801105, 0x90040602, 0x00082030, 0x2c200000, 0x00901105, 0x90400602, 0x00082038, 0x28200800, 0x40801105, 0x90440602, 0x00082038, 0x2c200800, 0x40901105, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000008, 0x00080000, 0x10000000, 0x02000000, 0x00000000, 0x00000080, 0x00001000, 0x02000000, 0x00000008, 0x00080080, 0x10001000, 0x00004000, 0x00000000, 0x00000040, 0x00040000, 0x00004000, 0x00000008, 0x00080040, 0x10040000, 0x02004000, 0x00000000, 0x000000c0, 0x00041000, 0x02004000, 0x00000008, 0x000800c0, 0x10041000, 0x00020000, 0x00008000, 0x08000000, 0x00200000, 0x00020000, 0x00008008, 0x08080000, 0x10200000, 0x02020000, 0x00008000, 0x08000080, 0x00201000, 0x02020000, 0x00008008, 0x08080080, 0x10201000, 0x00024000, 0x00008000, 0x08000040, 0x00240000, 0x00024000, 0x00008008, 0x08080040, 0x10240000, 0x02024000, 0x00008000, 0x080000c0, 0x00241000, 0x02024000, 0x00008008, 0x080800c0, 0x10241000, 0x00000000, 0x01000000, 0x00002000, 0x00000020, 0x00000000, 0x01000008, 0x00082000, 0x10000020, 0x02000000, 0x01000000, 0x00002080, 0x00001020, 0x02000000, 0x01000008, 0x00082080, 0x10001020, 0x00004000, 0x01000000, 0x00002040, 0x00040020, 0x00004000, 0x01000008, 0x00082040, 0x10040020, 0x02004000, 0x01000000, 0x000020c0, 0x00041020, 0x02004000, 0x01000008, 0x000820c0, 0x10041020, 0x00020000, 0x01008000, 0x08002000, 0x00200020, 0x00020000, 0x01008008, 0x08082000, 0x10200020, 0x02020000, 0x01008000, 0x08002080, 0x00201020, 0x02020000, 0x01008008, 0x08082080, 0x10201020, 0x00024000, 0x01008000, 0x08002040, 0x00240020, 0x00024000, 0x01008008, 0x08082040, 0x10240020, 0x02024000, 0x01008000, 0x080020c0, 0x00241020, 0x02024000, 0x01008008, 0x080820c0, 0x10241020, 0x00000400, 0x04000000, 0x00100000, 0x00000004, 0x00000400, 0x04000008, 0x00180000, 0x10000004, 0x02000400, 0x04000000, 0x00100080, 0x00001004, 0x02000400, 0x04000008, 0x00180080, 0x10001004, 0x00004400, 0x04000000, 0x00100040, 0x00040004, 0x00004400, 0x04000008, 0x00180040, 0x10040004, 0x02004400, 0x04000000, 0x001000c0, 0x00041004, 0x02004400, 0x04000008, 0x001800c0, 0x10041004, 0x00020400, 0x04008000, 0x08100000, 0x00200004, 0x00020400, 0x04008008, 0x08180000, 0x10200004, 0x02020400, 0x04008000, 0x08100080, 0x00201004, 0x02020400, 0x04008008, 0x08180080, 0x10201004, 0x00024400, 0x04008000, 0x08100040, 0x00240004, 0x00024400, 0x04008008, 0x08180040, 0x10240004, 0x02024400, 0x04008000, 0x081000c0, 0x00241004, 0x02024400, 0x04008008, 0x081800c0, 0x10241004, 0x00000400, 0x05000000, 0x00102000, 0x00000024, 0x00000400, 0x05000008, 0x00182000, 0x10000024, 0x02000400, 0x05000000, 0x00102080, 0x00001024, 0x02000400, 0x05000008, 0x00182080, 0x10001024, 0x00004400, 0x05000000, 0x00102040, 0x00040024, 0x00004400, 0x05000008, 0x00182040, 0x10040024, 0x02004400, 0x05000000, 0x001020c0, 0x00041024, 0x02004400, 0x05000008, 0x001820c0, 0x10041024, 0x00020400, 0x05008000, 0x08102000, 0x00200024, 0x00020400, 0x05008008, 0x08182000, 0x10200024, 0x02020400, 0x05008000, 0x08102080, 0x00201024, 0x02020400, 0x05008008, 0x08182080, 0x10201024, 0x00024400, 0x05008000, 0x08102040, 0x00240024, 0x00024400, 0x05008008, 0x08182040, 0x10240024, 0x02024400, 0x05008000, 0x081020c0, 0x00241024, 0x02024400, 0x05008008, 0x081820c0, 0x10241024, 0x00000800, 0x00010000, 0x20000000, 0x00000010, 0x00000800, 0x00010008, 0x20080000, 0x10000010, 0x02000800, 0x00010000, 0x20000080, 0x00001010, 0x02000800, 0x00010008, 0x20080080, 0x10001010, 0x00004800, 0x00010000, 0x20000040, 0x00040010, 0x00004800, 0x00010008, 0x20080040, 0x10040010, 0x02004800, 0x00010000, 0x200000c0, 0x00041010, 0x02004800, 0x00010008, 0x200800c0, 0x10041010, 0x00020800, 0x00018000, 0x28000000, 0x00200010, 0x00020800, 0x00018008, 0x28080000, 0x10200010, 0x02020800, 0x00018000, 0x28000080, 0x00201010, 0x02020800, 0x00018008, 0x28080080, 0x10201010, 0x00024800, 0x00018000, 0x28000040, 0x00240010, 0x00024800, 0x00018008, 0x28080040, 0x10240010, 0x02024800, 0x00018000, 0x280000c0, 0x00241010, 0x02024800, 0x00018008, 0x280800c0, 0x10241010, 0x00000800, 0x01010000, 0x20002000, 0x00000030, 0x00000800, 0x01010008, 0x20082000, 0x10000030, 0x02000800, 0x01010000, 0x20002080, 0x00001030, 0x02000800, 0x01010008, 0x20082080, 0x10001030, 0x00004800, 0x01010000, 0x20002040, 0x00040030, 0x00004800, 0x01010008, 0x20082040, 0x10040030, 0x02004800, 0x01010000, 0x200020c0, 0x00041030, 0x02004800, 0x01010008, 0x200820c0, 0x10041030, 0x00020800, 0x01018000, 0x28002000, 0x00200030, 0x00020800, 0x01018008, 0x28082000, 0x10200030, 0x02020800, 0x01018000, 0x28002080, 0x00201030, 0x02020800, 0x01018008, 0x28082080, 0x10201030, 0x00024800, 0x01018000, 0x28002040, 0x00240030, 0x00024800, 0x01018008, 0x28082040, 0x10240030, 0x02024800, 0x01018000, 0x280020c0, 0x00241030, 0x02024800, 0x01018008, 0x280820c0, 0x10241030, 0x00000c00, 0x04010000, 0x20100000, 0x00000014, 0x00000c00, 0x04010008, 0x20180000, 0x10000014, 0x02000c00, 0x04010000, 0x20100080, 0x00001014, 0x02000c00, 0x04010008, 0x20180080, 0x10001014, 0x00004c00, 0x04010000, 0x20100040, 0x00040014, 0x00004c00, 0x04010008, 0x20180040, 0x10040014, 0x02004c00, 0x04010000, 0x201000c0, 0x00041014, 0x02004c00, 0x04010008, 0x201800c0, 0x10041014, 0x00020c00, 0x04018000, 0x28100000, 0x00200014, 0x00020c00, 0x04018008, 0x28180000, 0x10200014, 0x02020c00, 0x04018000, 0x28100080, 0x00201014, 0x02020c00, 0x04018008, 0x28180080, 0x10201014, 0x00024c00, 0x04018000, 0x28100040, 0x00240014, 0x00024c00, 0x04018008, 0x28180040, 0x10240014, 0x02024c00, 0x04018000, 0x281000c0, 0x00241014, 0x02024c00, 0x04018008, 0x281800c0, 0x10241014, 0x00000c00, 0x05010000, 0x20102000, 0x00000034, 0x00000c00, 0x05010008, 0x20182000, 0x10000034, 0x02000c00, 0x05010000, 0x20102080, 0x00001034, 0x02000c00, 0x05010008, 0x20182080, 0x10001034, 0x00004c00, 0x05010000, 0x20102040, 0x00040034, 0x00004c00, 0x05010008, 0x20182040, 0x10040034, 0x02004c00, 0x05010000, 0x201020c0, 0x00041034, 0x02004c00, 0x05010008, 0x201820c0, 0x10041034, 0x00020c00, 0x05018000, 0x28102000, 0x00200034, 0x00020c00, 0x05018008, 0x28182000, 0x10200034, 0x02020c00, 0x05018000, 0x28102080, 0x00201034, 0x02020c00, 0x05018008, 0x28182080, 0x10201034, 0x00024c00, 0x05018000, 0x28102040, 0x00240034, 0x00024c00, 0x05018008, 0x28182040, 0x10240034, 0x02024c00, 0x05018000, 0x281020c0, 0x00241034, 0x02024c00, 0x05018008, 0x281820c0, 0x10241034 }; /* S-box lookup tables */ static const u32 S1[64] = { 0x01010400, 0x00000000, 0x00010000, 0x01010404, 0x01010004, 0x00010404, 0x00000004, 0x00010000, 0x00000400, 0x01010400, 0x01010404, 0x00000400, 0x01000404, 0x01010004, 0x01000000, 0x00000004, 0x00000404, 0x01000400, 0x01000400, 0x00010400, 0x00010400, 0x01010000, 0x01010000, 0x01000404, 0x00010004, 0x01000004, 0x01000004, 0x00010004, 0x00000000, 0x00000404, 0x00010404, 0x01000000, 0x00010000, 0x01010404, 0x00000004, 0x01010000, 0x01010400, 0x01000000, 0x01000000, 0x00000400, 0x01010004, 0x00010000, 0x00010400, 0x01000004, 0x00000400, 0x00000004, 0x01000404, 0x00010404, 0x01010404, 0x00010004, 0x01010000, 0x01000404, 0x01000004, 0x00000404, 0x00010404, 0x01010400, 0x00000404, 0x01000400, 0x01000400, 0x00000000, 0x00010004, 0x00010400, 0x00000000, 0x01010004 }; static const u32 S2[64] = { 0x80108020, 0x80008000, 0x00008000, 0x00108020, 0x00100000, 0x00000020, 0x80100020, 0x80008020, 0x80000020, 0x80108020, 0x80108000, 0x80000000, 0x80008000, 0x00100000, 0x00000020, 0x80100020, 0x00108000, 0x00100020, 0x80008020, 0x00000000, 0x80000000, 0x00008000, 0x00108020, 0x80100000, 0x00100020, 0x80000020, 0x00000000, 0x00108000, 0x00008020, 0x80108000, 0x80100000, 0x00008020, 0x00000000, 0x00108020, 0x80100020, 0x00100000, 0x80008020, 0x80100000, 0x80108000, 0x00008000, 0x80100000, 0x80008000, 0x00000020, 0x80108020, 0x00108020, 0x00000020, 0x00008000, 0x80000000, 0x00008020, 0x80108000, 0x00100000, 0x80000020, 0x00100020, 0x80008020, 0x80000020, 0x00100020, 0x00108000, 0x00000000, 0x80008000, 0x00008020, 0x80000000, 0x80100020, 0x80108020, 0x00108000 }; static const u32 S3[64] = { 0x00000208, 0x08020200, 0x00000000, 0x08020008, 0x08000200, 0x00000000, 0x00020208, 0x08000200, 0x00020008, 0x08000008, 0x08000008, 0x00020000, 0x08020208, 0x00020008, 0x08020000, 0x00000208, 0x08000000, 0x00000008, 0x08020200, 0x00000200, 0x00020200, 0x08020000, 0x08020008, 0x00020208, 0x08000208, 0x00020200, 0x00020000, 0x08000208, 0x00000008, 0x08020208, 0x00000200, 0x08000000, 0x08020200, 0x08000000, 0x00020008, 0x00000208, 0x00020000, 0x08020200, 0x08000200, 0x00000000, 0x00000200, 0x00020008, 0x08020208, 0x08000200, 0x08000008, 0x00000200, 0x00000000, 0x08020008, 0x08000208, 0x00020000, 0x08000000, 0x08020208, 0x00000008, 0x00020208, 0x00020200, 0x08000008, 0x08020000, 0x08000208, 0x00000208, 0x08020000, 0x00020208, 0x00000008, 0x08020008, 0x00020200 }; static const u32 S4[64] = { 0x00802001, 0x00002081, 0x00002081, 0x00000080, 0x00802080, 0x00800081, 0x00800001, 0x00002001, 0x00000000, 0x00802000, 0x00802000, 0x00802081, 0x00000081, 0x00000000, 0x00800080, 0x00800001, 0x00000001, 0x00002000, 0x00800000, 0x00802001, 0x00000080, 0x00800000, 0x00002001, 0x00002080, 0x00800081, 0x00000001, 0x00002080, 0x00800080, 0x00002000, 0x00802080, 0x00802081, 0x00000081, 0x00800080, 0x00800001, 0x00802000, 0x00802081, 0x00000081, 0x00000000, 0x00000000, 0x00802000, 0x00002080, 0x00800080, 0x00800081, 0x00000001, 0x00802001, 0x00002081, 0x00002081, 0x00000080, 0x00802081, 0x00000081, 0x00000001, 0x00002000, 0x00800001, 0x00002001, 0x00802080, 0x00800081, 0x00002001, 0x00002080, 0x00800000, 0x00802001, 0x00000080, 0x00800000, 0x00002000, 0x00802080 }; static const u32 S5[64] = { 0x00000100, 0x02080100, 0x02080000, 0x42000100, 0x00080000, 0x00000100, 0x40000000, 0x02080000, 0x40080100, 0x00080000, 0x02000100, 0x40080100, 0x42000100, 0x42080000, 0x00080100, 0x40000000, 0x02000000, 0x40080000, 0x40080000, 0x00000000, 0x40000100, 0x42080100, 0x42080100, 0x02000100, 0x42080000, 0x40000100, 0x00000000, 0x42000000, 0x02080100, 0x02000000, 0x42000000, 0x00080100, 0x00080000, 0x42000100, 0x00000100, 0x02000000, 0x40000000, 0x02080000, 0x42000100, 0x40080100, 0x02000100, 0x40000000, 0x42080000, 0x02080100, 0x40080100, 0x00000100, 0x02000000, 0x42080000, 0x42080100, 0x00080100, 0x42000000, 0x42080100, 0x02080000, 0x00000000, 0x40080000, 0x42000000, 0x00080100, 0x02000100, 0x40000100, 0x00080000, 0x00000000, 0x40080000, 0x02080100, 0x40000100 }; static const u32 S6[64] = { 0x20000010, 0x20400000, 0x00004000, 0x20404010, 0x20400000, 0x00000010, 0x20404010, 0x00400000, 0x20004000, 0x00404010, 0x00400000, 0x20000010, 0x00400010, 0x20004000, 0x20000000, 0x00004010, 0x00000000, 0x00400010, 0x20004010, 0x00004000, 0x00404000, 0x20004010, 0x00000010, 0x20400010, 0x20400010, 0x00000000, 0x00404010, 0x20404000, 0x00004010, 0x00404000, 0x20404000, 0x20000000, 0x20004000, 0x00000010, 0x20400010, 0x00404000, 0x20404010, 0x00400000, 0x00004010, 0x20000010, 0x00400000, 0x20004000, 0x20000000, 0x00004010, 0x20000010, 0x20404010, 0x00404000, 0x20400000, 0x00404010, 0x20404000, 0x00000000, 0x20400010, 0x00000010, 0x00004000, 0x20400000, 0x00404010, 0x00004000, 0x00400010, 0x20004010, 0x00000000, 0x20404000, 0x20000000, 0x00400010, 0x20004010 }; static const u32 S7[64] = { 0x00200000, 0x04200002, 0x04000802, 0x00000000, 0x00000800, 0x04000802, 0x00200802, 0x04200800, 0x04200802, 0x00200000, 0x00000000, 0x04000002, 0x00000002, 0x04000000, 0x04200002, 0x00000802, 0x04000800, 0x00200802, 0x00200002, 0x04000800, 0x04000002, 0x04200000, 0x04200800, 0x00200002, 0x04200000, 0x00000800, 0x00000802, 0x04200802, 0x00200800, 0x00000002, 0x04000000, 0x00200800, 0x04000000, 0x00200800, 0x00200000, 0x04000802, 0x04000802, 0x04200002, 0x04200002, 0x00000002, 0x00200002, 0x04000000, 0x04000800, 0x00200000, 0x04200800, 0x00000802, 0x00200802, 0x04200800, 0x00000802, 0x04000002, 0x04200802, 0x04200000, 0x00200800, 0x00000000, 0x00000002, 0x04200802, 0x00000000, 0x00200802, 0x04200000, 0x00000800, 0x04000002, 0x04000800, 0x00000800, 0x00200002 }; static const u32 S8[64] = { 0x10001040, 0x00001000, 0x00040000, 0x10041040, 0x10000000, 0x10001040, 0x00000040, 0x10000000, 0x00040040, 0x10040000, 0x10041040, 0x00041000, 0x10041000, 0x00041040, 0x00001000, 0x00000040, 0x10040000, 0x10000040, 0x10001000, 0x00001040, 0x00041000, 0x00040040, 0x10040040, 0x10041000, 0x00001040, 0x00000000, 0x00000000, 0x10040040, 0x10000040, 0x10001000, 0x00041040, 0x00040000, 0x00041040, 0x00040000, 0x10041000, 0x00001000, 0x00000040, 0x10040040, 0x00001000, 0x00041040, 0x10001000, 0x00000040, 0x10000040, 0x10040000, 0x10040040, 0x10000000, 0x00040000, 0x10001040, 0x00000000, 0x10041040, 0x00040040, 0x10000040, 0x10040000, 0x10001000, 0x10001040, 0x00000000, 0x10041040, 0x00041000, 0x00041000, 0x00001040, 0x00001040, 0x00040040, 0x10000000, 0x10041000 }; /* Encryption components: IP, FP, and round function */ #define IP(L, R, T) \ ROL(R, 4); \ T = L; \ L ^= R; \ L &= 0xf0f0f0f0; \ R ^= L; \ L ^= T; \ ROL(R, 12); \ T = L; \ L ^= R; \ L &= 0xffff0000; \ R ^= L; \ L ^= T; \ ROR(R, 14); \ T = L; \ L ^= R; \ L &= 0xcccccccc; \ R ^= L; \ L ^= T; \ ROL(R, 6); \ T = L; \ L ^= R; \ L &= 0xff00ff00; \ R ^= L; \ L ^= T; \ ROR(R, 7); \ T = L; \ L ^= R; \ L &= 0xaaaaaaaa; \ R ^= L; \ L ^= T; \ ROL(L, 1); #define FP(L, R, T) \ ROR(L, 1); \ T = L; \ L ^= R; \ L &= 0xaaaaaaaa; \ R ^= L; \ L ^= T; \ ROL(R, 7); \ T = L; \ L ^= R; \ L &= 0xff00ff00; \ R ^= L; \ L ^= T; \ ROR(R, 6); \ T = L; \ L ^= R; \ L &= 0xcccccccc; \ R ^= L; \ L ^= T; \ ROL(R, 14); \ T = L; \ L ^= R; \ L &= 0xffff0000; \ R ^= L; \ L ^= T; \ ROR(R, 12); \ T = L; \ L ^= R; \ L &= 0xf0f0f0f0; \ R ^= L; \ L ^= T; \ ROR(R, 4); #define ROUND(L, R, A, B, K, d) \ B = K[0]; A = K[1]; K += d; \ B ^= R; A ^= R; \ B &= 0x3f3f3f3f; ROR(A, 4); \ L ^= S8[0xff & B]; A &= 0x3f3f3f3f; \ L ^= S6[0xff & (B >> 8)]; B >>= 16; \ L ^= S7[0xff & A]; \ L ^= S5[0xff & (A >> 8)]; A >>= 16; \ L ^= S4[0xff & B]; \ L ^= S2[0xff & (B >> 8)]; \ L ^= S3[0xff & A]; \ L ^= S1[0xff & (A >> 8)]; /* * PC2 lookup tables are organized as 2 consecutive sets of 4 interleaved * tables of 128 elements. One set is for C_i and the other for D_i, while * the 4 interleaved tables correspond to four 7-bit subsets of C_i or D_i. * * After PC1 each of the variables a,b,c,d contains a 7 bit subset of C_i * or D_i in bits 7-1 (bit 0 being the least significant). */ #define T1(x) pt[2 * (x) + 0] #define T2(x) pt[2 * (x) + 1] #define T3(x) pt[2 * (x) + 2] #define T4(x) pt[2 * (x) + 3] #define DES_PC2(a, b, c, d) (T4(d) | T3(c) | T2(b) | T1(a)) /* * Encryption key expansion * * RFC2451: Weak key checks SHOULD be performed. * * FIPS 74: * * Keys having duals are keys which produce all zeros, all ones, or * alternating zero-one patterns in the C and D registers after Permuted * Choice 1 has operated on the key. * */ static unsigned long des_ekey(u32 *pe, const u8 *k) { /* K&R: long is at least 32 bits */ unsigned long a, b, c, d, w; const u32 *pt = pc2; d = k[4]; d &= 0x0e; d <<= 4; d |= k[0] & 0x1e; d = pc1[d]; c = k[5]; c &= 0x0e; c <<= 4; c |= k[1] & 0x1e; c = pc1[c]; b = k[6]; b &= 0x0e; b <<= 4; b |= k[2] & 0x1e; b = pc1[b]; a = k[7]; a &= 0x0e; a <<= 4; a |= k[3] & 0x1e; a = pc1[a]; pe[15 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; pe[14 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[13 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[12 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[11 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[10 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 9 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 8 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 7 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 6 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 5 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 4 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 3 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 2 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 1 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; pe[ 0 * 2 + 0] = DES_PC2(b, c, d, a); /* Check if first half is weak */ w = (a ^ c) | (b ^ d) | (rs[a] ^ c) | (b ^ rs[d]); /* Skip to next table set */ pt += 512; d = k[0]; d &= 0xe0; d >>= 4; d |= k[4] & 0xf0; d = pc1[d + 1]; c = k[1]; c &= 0xe0; c >>= 4; c |= k[5] & 0xf0; c = pc1[c + 1]; b = k[2]; b &= 0xe0; b >>= 4; b |= k[6] & 0xf0; b = pc1[b + 1]; a = k[3]; a &= 0xe0; a >>= 4; a |= k[7] & 0xf0; a = pc1[a + 1]; /* Check if second half is weak */ w |= (a ^ c) | (b ^ d) | (rs[a] ^ c) | (b ^ rs[d]); pe[15 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; pe[14 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[13 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[12 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[11 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[10 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 9 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 8 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 7 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 6 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 5 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 4 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 3 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 2 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 1 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; pe[ 0 * 2 + 1] = DES_PC2(b, c, d, a); /* Fixup: 2413 5768 -> 1357 2468 */ for (d = 0; d < 16; ++d) { a = pe[2 * d]; b = pe[2 * d + 1]; c = a ^ b; c &= 0xffff0000; a ^= c; b ^= c; ROL(b, 18); pe[2 * d] = a; pe[2 * d + 1] = b; } /* Zero if weak key */ return w; } int des_expand_key(struct des_ctx *ctx, const u8 *key, unsigned int keylen) { if (keylen != DES_KEY_SIZE) return -EINVAL; return des_ekey(ctx->expkey, key) ? 0 : -ENOKEY; } EXPORT_SYMBOL_GPL(des_expand_key); /* * Decryption key expansion * * No weak key checking is performed, as this is only used by triple DES * */ static void dkey(u32 *pe, const u8 *k) { /* K&R: long is at least 32 bits */ unsigned long a, b, c, d; const u32 *pt = pc2; d = k[4]; d &= 0x0e; d <<= 4; d |= k[0] & 0x1e; d = pc1[d]; c = k[5]; c &= 0x0e; c <<= 4; c |= k[1] & 0x1e; c = pc1[c]; b = k[6]; b &= 0x0e; b <<= 4; b |= k[2] & 0x1e; b = pc1[b]; a = k[7]; a &= 0x0e; a <<= 4; a |= k[3] & 0x1e; a = pc1[a]; pe[ 0 * 2] = DES_PC2(a, b, c, d); d = rs[d]; pe[ 1 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 2 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 3 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 4 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 5 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 6 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 7 * 2] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 8 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 9 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[10 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[11 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[12 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[13 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[14 * 2] = DES_PC2(c, d, a, b); b = rs[b]; pe[15 * 2] = DES_PC2(b, c, d, a); /* Skip to next table set */ pt += 512; d = k[0]; d &= 0xe0; d >>= 4; d |= k[4] & 0xf0; d = pc1[d + 1]; c = k[1]; c &= 0xe0; c >>= 4; c |= k[5] & 0xf0; c = pc1[c + 1]; b = k[2]; b &= 0xe0; b >>= 4; b |= k[6] & 0xf0; b = pc1[b + 1]; a = k[3]; a &= 0xe0; a >>= 4; a |= k[7] & 0xf0; a = pc1[a + 1]; pe[ 0 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; pe[ 1 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 2 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 3 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 4 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 5 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 6 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 7 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 8 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 9 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[10 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[11 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[12 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[13 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[14 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; pe[15 * 2 + 1] = DES_PC2(b, c, d, a); /* Fixup: 2413 5768 -> 1357 2468 */ for (d = 0; d < 16; ++d) { a = pe[2 * d]; b = pe[2 * d + 1]; c = a ^ b; c &= 0xffff0000; a ^= c; b ^= c; ROL(b, 18); pe[2 * d] = a; pe[2 * d + 1] = b; } } void des_encrypt(const struct des_ctx *ctx, u8 *dst, const u8 *src) { const u32 *K = ctx->expkey; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des_encrypt); void des_decrypt(const struct des_ctx *ctx, u8 *dst, const u8 *src) { const u32 *K = ctx->expkey + DES_EXPKEY_WORDS - 2; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des_decrypt); int des3_ede_expand_key(struct des3_ede_ctx *ctx, const u8 *key, unsigned int keylen) { u32 *pe = ctx->expkey; int err; if (keylen != DES3_EDE_KEY_SIZE) return -EINVAL; err = des3_ede_verify_key(key, keylen, true); if (err && err != -ENOKEY) return err; des_ekey(pe, key); pe += DES_EXPKEY_WORDS; key += DES_KEY_SIZE; dkey(pe, key); pe += DES_EXPKEY_WORDS; key += DES_KEY_SIZE; des_ekey(pe, key); return err; } EXPORT_SYMBOL_GPL(des3_ede_expand_key); void des3_ede_encrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) { const u32 *K = dctx->expkey; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } for (i = 0; i < 8; i++) { ROUND(R, L, A, B, K, 2); ROUND(L, R, A, B, K, 2); } for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des3_ede_encrypt); void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) { const u32 *K = dctx->expkey + DES3_EDE_EXPKEY_WORDS - 2; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } for (i = 0; i < 8; i++) { ROUND(R, L, A, B, K, -2); ROUND(L, R, A, B, K, -2); } for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des3_ede_decrypt); MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); MODULE_LICENSE("GPL"); |
| 8 11 11 8 8 8 1 3 8 8 1 8 8 8 6 6 6 6 6 6 6 6 6 6 6 6 2 18 4 4 11 16 16 11 11 1 6 5 1 1 1 1 1 35 36 20 16 1 1 1 1 1 1 6 3 1 1 1 36 1 30 1 1 4 2 22 1 20 1 11 2 7 1 7 40 12 4 1 3 52 52 52 1 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ #include "distributed-arp-table.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/udp.h> #include <linux/unaligned.h> #include <linux/workqueue.h> #include <net/arp.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" enum batadv_bootpop { BATADV_BOOTREPLY = 2, }; enum batadv_boothtype { BATADV_HTYPE_ETHERNET = 1, }; enum batadv_dhcpoptioncode { BATADV_DHCP_OPT_PAD = 0, BATADV_DHCP_OPT_MSG_TYPE = 53, BATADV_DHCP_OPT_END = 255, }; enum batadv_dhcptype { BATADV_DHCPACK = 5, }; /* { 99, 130, 83, 99 } */ #define BATADV_DHCP_MAGIC 1669485411 struct batadv_dhcp_packet { __u8 op; __u8 htype; __u8 hlen; __u8 hops; __be32 xid; __be16 secs; __be16 flags; __be32 ciaddr; __be32 yiaddr; __be32 siaddr; __be32 giaddr; __u8 chaddr[16]; __u8 sname[64]; __u8 file[128]; __be32 magic; /* __u8 options[]; */ }; #define BATADV_DHCP_YIADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->yiaddr) #define BATADV_DHCP_CHADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->chaddr) static void batadv_dat_purge(struct work_struct *work); /** * batadv_dat_start_timer() - initialise the DAT periodic worker * @bat_priv: the bat priv with all the soft interface information */ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->dat.work, msecs_to_jiffies(10000)); } /** * batadv_dat_entry_release() - release dat_entry from lists and queue for free * after rcu grace period * @ref: kref pointer of the dat_entry */ static void batadv_dat_entry_release(struct kref *ref) { struct batadv_dat_entry *dat_entry; dat_entry = container_of(ref, struct batadv_dat_entry, refcount); kfree_rcu(dat_entry, rcu); } /** * batadv_dat_entry_put() - decrement the dat_entry refcounter and possibly * release it * @dat_entry: dat_entry to be free'd */ static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry) { if (!dat_entry) return; kref_put(&dat_entry->refcount, batadv_dat_entry_release); } /** * batadv_dat_to_purge() - check whether a dat_entry has to be purged or not * @dat_entry: the entry to check * * Return: true if the entry has to be purged now, false otherwise. */ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) { return batadv_has_timed_out(dat_entry->last_update, BATADV_DAT_ENTRY_TIMEOUT); } /** * __batadv_dat_purge() - delete entries from the DAT local storage * @bat_priv: the bat priv with all the soft interface information * @to_purge: function in charge to decide whether an entry has to be purged or * not. This function takes the dat_entry as argument and has to * returns a boolean value: true is the entry has to be deleted, * false otherwise * * Loops over each entry in the DAT local storage and deletes it if and only if * the to_purge function passed as argument returns true. */ static void __batadv_dat_purge(struct batadv_priv *bat_priv, bool (*to_purge)(struct batadv_dat_entry *)) { spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_dat_entry *dat_entry; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; if (!bat_priv->dat.hash) return; for (i = 0; i < bat_priv->dat.hash->size; i++) { head = &bat_priv->dat.hash->table[i]; list_lock = &bat_priv->dat.hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(dat_entry, node_tmp, head, hash_entry) { /* if a helper function has been passed as parameter, * ask it if the entry has to be purged or not */ if (to_purge && !to_purge(dat_entry)) continue; hlist_del_rcu(&dat_entry->hash_entry); batadv_dat_entry_put(dat_entry); } spin_unlock_bh(list_lock); } } /** * batadv_dat_purge() - periodic task that deletes old entries from the local * DAT hash table * @work: kernel work struct */ static void batadv_dat_purge(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_dat *priv_dat; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_dat = container_of(delayed_work, struct batadv_priv_dat, work); bat_priv = container_of(priv_dat, struct batadv_priv, dat); __batadv_dat_purge(bat_priv, batadv_dat_to_purge); batadv_dat_start_timer(bat_priv); } /** * batadv_compare_dat() - comparing function used in the local DAT hash table * @node: node in the local table * @data2: second object to compare the node to * * Return: true if the two entries are the same, false otherwise. */ static bool batadv_compare_dat(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_dat_entry, hash_entry); return memcmp(data1, data2, sizeof(__be32)) == 0; } /** * batadv_arp_hw_src() - extract the hw_src field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * * Return: the value of the hw_src field in the ARP packet. */ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) { u8 *addr; addr = (u8 *)(skb->data + hdr_size); addr += ETH_HLEN + sizeof(struct arphdr); return addr; } /** * batadv_arp_ip_src() - extract the ip_src field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * * Return: the value of the ip_src field in the ARP packet. */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { return *(__force __be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN); } /** * batadv_arp_hw_dst() - extract the hw_dst field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * * Return: the value of the hw_dst field in the ARP packet. */ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) { return batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN + 4; } /** * batadv_arp_ip_dst() - extract the ip_dst field from an ARP packet * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * * Return: the value of the ip_dst field in the ARP packet. */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { u8 *dst = batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4; return *(__force __be32 *)dst; } /** * batadv_hash_dat() - compute the hash value for an IP address * @data: data to hash * @size: size of the hash table * * Return: the selected index in the hash table for the given data. */ static u32 batadv_hash_dat(const void *data, u32 size) { u32 hash = 0; const struct batadv_dat_entry *dat = data; const unsigned char *key; __be16 vid; u32 i; key = (__force const unsigned char *)&dat->ip; for (i = 0; i < sizeof(dat->ip); i++) { hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6); } vid = htons(dat->vid); key = (__force const unsigned char *)&vid; for (i = 0; i < sizeof(dat->vid); i++) { hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6); } hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash % size; } /** * batadv_dat_entry_hash_find() - look for a given dat_entry in the local hash * table * @bat_priv: the bat priv with all the soft interface information * @ip: search key * @vid: VLAN identifier * * Return: the dat_entry if found, NULL otherwise. */ static struct batadv_dat_entry * batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, unsigned short vid) { struct hlist_head *head; struct batadv_dat_entry to_find, *dat_entry, *dat_entry_tmp = NULL; struct batadv_hashtable *hash = bat_priv->dat.hash; u32 index; if (!hash) return NULL; to_find.ip = ip; to_find.vid = vid; index = batadv_hash_dat(&to_find, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { if (dat_entry->ip != ip) continue; if (!kref_get_unless_zero(&dat_entry->refcount)) continue; dat_entry_tmp = dat_entry; break; } rcu_read_unlock(); return dat_entry_tmp; } /** * batadv_dat_entry_add() - add a new dat entry or update it if already exists * @bat_priv: the bat priv with all the soft interface information * @ip: ipv4 to add/edit * @mac_addr: mac address to assign to the given ipv4 * @vid: VLAN identifier */ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, u8 *mac_addr, unsigned short vid) { struct batadv_dat_entry *dat_entry; int hash_added; dat_entry = batadv_dat_entry_hash_find(bat_priv, ip, vid); /* if this entry is already known, just update it */ if (dat_entry) { if (!batadv_compare_eth(dat_entry->mac_addr, mac_addr)) ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; batadv_dbg(BATADV_DBG_DAT, bat_priv, "Entry updated: %pI4 %pM (vid: %d)\n", &dat_entry->ip, dat_entry->mac_addr, batadv_print_vid(vid)); goto out; } dat_entry = kmalloc(sizeof(*dat_entry), GFP_ATOMIC); if (!dat_entry) goto out; dat_entry->ip = ip; dat_entry->vid = vid; ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; kref_init(&dat_entry->refcount); kref_get(&dat_entry->refcount); hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat, batadv_hash_dat, dat_entry, &dat_entry->hash_entry); if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_dat_entry_put(dat_entry); goto out; } batadv_dbg(BATADV_DBG_DAT, bat_priv, "New entry added: %pI4 %pM (vid: %d)\n", &dat_entry->ip, dat_entry->mac_addr, batadv_print_vid(vid)); out: batadv_dat_entry_put(dat_entry); } #ifdef CONFIG_BATMAN_ADV_DEBUG /** * batadv_dbg_arp() - print a debug message containing all the ARP packet * details * @bat_priv: the bat priv with all the soft interface information * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * @msg: message to print together with the debugging information */ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size, char *msg) { struct batadv_unicast_4addr_packet *unicast_4addr_packet; struct batadv_bcast_packet *bcast_pkt; u8 *orig_addr; __be32 ip_src, ip_dst; if (msg) batadv_dbg(BATADV_DBG_DAT, bat_priv, "%s\n", msg); ip_src = batadv_arp_ip_src(skb, hdr_size); ip_dst = batadv_arp_ip_dst(skb, hdr_size); batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP MSG = [src: %pM-%pI4 dst: %pM-%pI4]\n", batadv_arp_hw_src(skb, hdr_size), &ip_src, batadv_arp_hw_dst(skb, hdr_size), &ip_dst); if (hdr_size < sizeof(struct batadv_unicast_packet)) return; unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; switch (unicast_4addr_packet->u.packet_type) { case BATADV_UNICAST: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* encapsulated within a UNICAST packet\n"); break; case BATADV_UNICAST_4ADDR: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* encapsulated within a UNICAST_4ADDR packet (src: %pM)\n", unicast_4addr_packet->src); switch (unicast_4addr_packet->subtype) { case BATADV_P_DAT_DHT_PUT: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DAT_DHT_PUT\n"); break; case BATADV_P_DAT_DHT_GET: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DAT_DHT_GET\n"); break; case BATADV_P_DAT_CACHE_REPLY: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DAT_CACHE_REPLY\n"); break; case BATADV_P_DATA: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DATA\n"); break; default: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: Unknown (%u)!\n", unicast_4addr_packet->u.packet_type); } break; case BATADV_BCAST: bcast_pkt = (struct batadv_bcast_packet *)unicast_4addr_packet; orig_addr = bcast_pkt->orig; batadv_dbg(BATADV_DBG_DAT, bat_priv, "* encapsulated within a BCAST packet (src: %pM)\n", orig_addr); break; default: batadv_dbg(BATADV_DBG_DAT, bat_priv, "* encapsulated within an unknown packet type (0x%x)\n", unicast_4addr_packet->u.packet_type); } } #else static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size, char *msg) { } #endif /* CONFIG_BATMAN_ADV_DEBUG */ /** * batadv_is_orig_node_eligible() - check whether a node can be a DHT candidate * @res: the array with the already selected candidates * @select: number of already selected candidates * @tmp_max: address of the currently evaluated node * @max: current round max address * @last_max: address of the last selected candidate * @candidate: orig_node under evaluation * @max_orig_node: last selected candidate * * Return: true if the node has been elected as next candidate or false * otherwise. */ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, int select, batadv_dat_addr_t tmp_max, batadv_dat_addr_t max, batadv_dat_addr_t last_max, struct batadv_orig_node *candidate, struct batadv_orig_node *max_orig_node) { bool ret = false; int j; /* check if orig node candidate is running DAT */ if (!test_bit(BATADV_ORIG_CAPA_HAS_DAT, &candidate->capabilities)) goto out; /* Check if this node has already been selected... */ for (j = 0; j < select; j++) if (res[j].orig_node == candidate) break; /* ..and possibly skip it */ if (j < select) goto out; /* sanity check: has it already been selected? This should not happen */ if (tmp_max > last_max) goto out; /* check if during this iteration an originator with a closer dht * address has already been found */ if (tmp_max < max) goto out; /* this is an hash collision with the temporary selected node. Choose * the one with the lowest address */ if (tmp_max == max && max_orig_node && batadv_compare_eth(candidate->orig, max_orig_node->orig)) goto out; ret = true; out: return ret; } /** * batadv_choose_next_candidate() - select the next DHT candidate * @bat_priv: the bat priv with all the soft interface information * @cands: candidates array * @select: number of candidates already present in the array * @ip_key: key to look up in the DHT * @last_max: pointer where the address of the selected candidate will be saved */ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, struct batadv_dat_candidate *cands, int select, batadv_dat_addr_t ip_key, batadv_dat_addr_t *last_max) { batadv_dat_addr_t max = 0; batadv_dat_addr_t tmp_max = 0; struct batadv_orig_node *orig_node, *max_orig_node = NULL; struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int i; /* if no node is eligible as candidate, leave the candidate type as * NOT_FOUND */ cands[select].type = BATADV_DAT_CANDIDATE_NOT_FOUND; /* iterate over the originator list and find the node with the closest * dat_address which has not been selected yet */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { /* the dht space is a ring using unsigned addresses */ tmp_max = BATADV_DAT_ADDR_MAX - orig_node->dat_addr + ip_key; if (!batadv_is_orig_node_eligible(cands, select, tmp_max, max, *last_max, orig_node, max_orig_node)) continue; if (!kref_get_unless_zero(&orig_node->refcount)) continue; max = tmp_max; batadv_orig_node_put(max_orig_node); max_orig_node = orig_node; } rcu_read_unlock(); } if (max_orig_node) { cands[select].type = BATADV_DAT_CANDIDATE_ORIG; cands[select].orig_node = max_orig_node; batadv_dbg(BATADV_DBG_DAT, bat_priv, "dat_select_candidates() %d: selected %pM addr=%u dist=%u\n", select, max_orig_node->orig, max_orig_node->dat_addr, max); } *last_max = max; } /** * batadv_dat_select_candidates() - select the nodes which the DHT message has * to be sent to * @bat_priv: the bat priv with all the soft interface information * @ip_dst: ipv4 to look up in the DHT * @vid: VLAN identifier * * An originator O is selected if and only if its DHT_ID value is one of three * closest values (from the LEFT, with wrap around if needed) then the hash * value of the key. ip_dst is the key. * * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM. */ static struct batadv_dat_candidate * batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, unsigned short vid) { int select; batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key; struct batadv_dat_candidate *res; struct batadv_dat_entry dat; if (!bat_priv->orig_hash) return NULL; res = kmalloc_array(BATADV_DAT_CANDIDATES_NUM, sizeof(*res), GFP_ATOMIC); if (!res) return NULL; dat.ip = ip_dst; dat.vid = vid; ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat, BATADV_DAT_ADDR_MAX); batadv_dbg(BATADV_DBG_DAT, bat_priv, "%s(): IP=%pI4 hash(IP)=%u\n", __func__, &ip_dst, ip_key); for (select = 0; select < BATADV_DAT_CANDIDATES_NUM; select++) batadv_choose_next_candidate(bat_priv, res, select, ip_key, &last_max); return res; } /** * batadv_dat_forward_data() - copy and send payload to the selected candidates * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @ip: the DHT key * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * * This function copies the skb with pskb_copy() and is sent as a unicast packet * to each of the selected candidates. * * Return: true if the packet is sent to at least one candidate, false * otherwise. */ static bool batadv_dat_forward_data(struct batadv_priv *bat_priv, struct sk_buff *skb, __be32 ip, unsigned short vid, int packet_subtype) { int i; bool ret = false; int send_status; struct batadv_neigh_node *neigh_node = NULL; struct sk_buff *tmp_skb; struct batadv_dat_candidate *cand; cand = batadv_dat_select_candidates(bat_priv, ip, vid); if (!cand) return ret; batadv_dbg(BATADV_DBG_DAT, bat_priv, "DHT_SEND for %pI4\n", &ip); for (i = 0; i < BATADV_DAT_CANDIDATES_NUM; i++) { if (cand[i].type == BATADV_DAT_CANDIDATE_NOT_FOUND) continue; neigh_node = batadv_orig_router_get(cand[i].orig_node, BATADV_IF_DEFAULT); if (!neigh_node) goto free_orig; tmp_skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb, cand[i].orig_node, packet_subtype)) { kfree_skb(tmp_skb); goto free_neigh; } send_status = batadv_send_unicast_skb(tmp_skb, neigh_node); if (send_status == NET_XMIT_SUCCESS) { /* count the sent packet */ switch (packet_subtype) { case BATADV_P_DAT_DHT_GET: batadv_inc_counter(bat_priv, BATADV_CNT_DAT_GET_TX); break; case BATADV_P_DAT_DHT_PUT: batadv_inc_counter(bat_priv, BATADV_CNT_DAT_PUT_TX); break; } /* packet sent to a candidate: return true */ ret = true; } free_neigh: batadv_neigh_node_put(neigh_node); free_orig: batadv_orig_node_put(cand[i].orig_node); } kfree(cand); return ret; } /** * batadv_dat_tvlv_container_update() - update the dat tvlv container after dat * setting change * @bat_priv: the bat priv with all the soft interface information */ static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv) { char dat_mode; dat_mode = atomic_read(&bat_priv->distributed_arp_table); switch (dat_mode) { case 0: batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_DAT, 1); break; case 1: batadv_tvlv_container_register(bat_priv, BATADV_TVLV_DAT, 1, NULL, 0); break; } } /** * batadv_dat_status_update() - update the dat tvlv container after dat * setting change * @net_dev: the soft interface net device */ void batadv_dat_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); batadv_dat_tvlv_container_update(bat_priv); } /** * batadv_dat_tvlv_ogm_handler_v1() - process incoming dat tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data * @tvlv_value_len: tvlv buffer length */ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) clear_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); else set_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); } /** * batadv_dat_hash_free() - free the local DAT hash table * @bat_priv: the bat priv with all the soft interface information */ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) { if (!bat_priv->dat.hash) return; __batadv_dat_purge(bat_priv, NULL); batadv_hash_destroy(bat_priv->dat.hash); bat_priv->dat.hash = NULL; } /** * batadv_dat_init() - initialise the DAT internals * @bat_priv: the bat priv with all the soft interface information * * Return: 0 in case of success, a negative error code otherwise */ int batadv_dat_init(struct batadv_priv *bat_priv) { if (bat_priv->dat.hash) return 0; bat_priv->dat.hash = batadv_hash_new(1024); if (!bat_priv->dat.hash) return -ENOMEM; INIT_DELAYED_WORK(&bat_priv->dat.work, batadv_dat_purge); batadv_dat_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_dat_tvlv_ogm_handler_v1, NULL, NULL, BATADV_TVLV_DAT, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_dat_tvlv_container_update(bat_priv); return 0; } /** * batadv_dat_free() - free the DAT internals * @bat_priv: the bat priv with all the soft interface information */ void batadv_dat_free(struct batadv_priv *bat_priv) { batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_DAT, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_DAT, 1); cancel_delayed_work_sync(&bat_priv->dat.work); batadv_dat_hash_free(bat_priv); } /** * batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a * netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @dat_entry: entry to dump * * Return: 0 or error code. */ static int batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_dat_entry *dat_entry) { int msecs; void *hdr; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); msecs = jiffies_to_msecs(jiffies - dat_entry->last_update); if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS, dat_entry->ip) || nla_put(msg, BATADV_ATTR_DAT_CACHE_HWADDRESS, ETH_ALEN, dat_entry->mac_addr) || nla_put_u16(msg, BATADV_ATTR_DAT_CACHE_VID, dat_entry->vid) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } genlmsg_end(msg, hdr); return 0; } /** * batadv_dat_cache_dump_bucket() - dump one bucket of the DAT cache table to * a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hashtable *hash, unsigned int bucket, int *idx_skip) { struct batadv_dat_entry *dat_entry; int idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(dat_entry, &hash->table[bucket], hash_entry) { if (idx < *idx_skip) goto skip; if (batadv_dat_cache_dump_entry(msg, portid, cb, dat_entry)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; } skip: idx++; } spin_unlock_bh(&hash->list_locks[bucket]); return 0; } /** * batadv_dat_cache_dump() - dump DAT cache table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct net_device *soft_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; int ret = 0; soft_iface = batadv_netlink_get_softif(cb); if (IS_ERR(soft_iface)) return PTR_ERR(soft_iface); bat_priv = netdev_priv(soft_iface); hash = bat_priv->dat.hash; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } while (bucket < hash->size) { if (batadv_dat_cache_dump_bucket(msg, portid, cb, hash, bucket, &idx)) break; bucket++; idx = 0; } cb->args[0] = bucket; cb->args[1] = idx; ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); return ret; } /** * batadv_arp_get_type() - parse an ARP packet and gets the type * @bat_priv: the bat priv with all the soft interface information * @skb: packet to analyse * @hdr_size: size of the possible header before the ARP packet in the skb * * Return: the ARP type if the skb contains a valid ARP packet, 0 otherwise. */ static u16 batadv_arp_get_type(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { struct arphdr *arphdr; struct ethhdr *ethhdr; __be32 ip_src, ip_dst; u8 *hw_src, *hw_dst; u16 type = 0; /* pull the ethernet header */ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN))) goto out; ethhdr = (struct ethhdr *)(skb->data + hdr_size); if (ethhdr->h_proto != htons(ETH_P_ARP)) goto out; /* pull the ARP payload */ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN + arp_hdr_len(skb->dev)))) goto out; arphdr = (struct arphdr *)(skb->data + hdr_size + ETH_HLEN); /* check whether the ARP packet carries a valid IP information */ if (arphdr->ar_hrd != htons(ARPHRD_ETHER)) goto out; if (arphdr->ar_pro != htons(ETH_P_IP)) goto out; if (arphdr->ar_hln != ETH_ALEN) goto out; if (arphdr->ar_pln != 4) goto out; /* Check for bad reply/request. If the ARP message is not sane, DAT * will simply ignore it */ ip_src = batadv_arp_ip_src(skb, hdr_size); ip_dst = batadv_arp_ip_dst(skb, hdr_size); if (ipv4_is_loopback(ip_src) || ipv4_is_multicast(ip_src) || ipv4_is_loopback(ip_dst) || ipv4_is_multicast(ip_dst) || ipv4_is_zeronet(ip_src) || ipv4_is_lbcast(ip_src) || ipv4_is_zeronet(ip_dst) || ipv4_is_lbcast(ip_dst)) goto out; hw_src = batadv_arp_hw_src(skb, hdr_size); if (is_zero_ether_addr(hw_src) || is_multicast_ether_addr(hw_src)) goto out; /* don't care about the destination MAC address in ARP requests */ if (arphdr->ar_op != htons(ARPOP_REQUEST)) { hw_dst = batadv_arp_hw_dst(skb, hdr_size); if (is_zero_ether_addr(hw_dst) || is_multicast_ether_addr(hw_dst)) goto out; } type = ntohs(arphdr->ar_op); out: return type; } /** * batadv_dat_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet to extract the VID from * @hdr_size: the size of the batman-adv header encapsulating the packet * * Return: If the packet embedded in the skb is vlan tagged this function * returns the VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS * is returned. */ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) { unsigned short vid; vid = batadv_get_vid(skb, *hdr_size); /* ARP parsing functions jump forward of hdr_size + ETH_HLEN. * If the header contained in the packet is a VLAN one (which is longer) * hdr_size is updated so that the functions will still skip the * correct amount of bytes. */ if (vid & BATADV_VLAN_HAS_TAG) *hdr_size += VLAN_HLEN; return vid; } /** * batadv_dat_arp_create_reply() - create an ARP Reply * @bat_priv: the bat priv with all the soft interface information * @ip_src: ARP sender IP * @ip_dst: ARP target IP * @hw_src: Ethernet source and ARP sender MAC * @hw_dst: Ethernet destination and ARP target MAC * @vid: VLAN identifier (optional, set to zero otherwise) * * Creates an ARP Reply from the given values, optionally encapsulated in a * VLAN header. * * Return: An skb containing an ARP Reply. */ static struct sk_buff * batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src, __be32 ip_dst, u8 *hw_src, u8 *hw_dst, unsigned short vid) { struct sk_buff *skb; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface, ip_src, hw_dst, hw_src, hw_dst); if (!skb) return NULL; skb_reset_mac_header(skb); if (vid & BATADV_VLAN_HAS_TAG) skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); return skb; } /** * batadv_dat_snoop_outgoing_arp_request() - snoop the ARP request and try to * answer using DAT * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * * Return: true if the message has been sent to the dht candidates, false * otherwise. In case of a positive return value the message has to be enqueued * to permit the fallback. */ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb) { u16 type = 0; __be32 ip_dst, ip_src; u8 *hw_src; bool ret = false; struct batadv_dat_entry *dat_entry = NULL; struct sk_buff *skb_new; struct net_device *soft_iface = bat_priv->soft_iface; int hdr_size = 0; unsigned short vid; if (!atomic_read(&bat_priv->distributed_arp_table)) goto out; vid = batadv_dat_get_vid(skb, &hdr_size); type = batadv_arp_get_type(bat_priv, skb, hdr_size); /* If the node gets an ARP_REQUEST it has to send a DHT_GET unicast * message to the selected DHT candidates */ if (type != ARPOP_REQUEST) goto out; batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REQUEST"); ip_src = batadv_arp_ip_src(skb, hdr_size); hw_src = batadv_arp_hw_src(skb, hdr_size); ip_dst = batadv_arp_ip_dst(skb, hdr_size); batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid); dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid); if (dat_entry) { /* If the ARP request is destined for a local client the local * client will answer itself. DAT would only generate a * duplicate packet. * * Moreover, if the soft-interface is enslaved into a bridge, an * additional DAT answer may trigger kernel warnings about * a packet coming from the wrong port. */ if (batadv_is_my_client(bat_priv, dat_entry->mac_addr, vid)) { ret = true; goto out; } /* If BLA is enabled, only send ARP replies if we have claimed * the destination for the ARP request or if no one else of * the backbone gws belonging to our backbone has claimed the * destination. */ if (!batadv_bla_check_claim(bat_priv, dat_entry->mac_addr, vid)) { batadv_dbg(BATADV_DBG_DAT, bat_priv, "Device %pM claimed by another backbone gw. Don't send ARP reply!", dat_entry->mac_addr); ret = true; goto out; } skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src, dat_entry->mac_addr, hw_src, vid); if (!skb_new) goto out; skb_new->protocol = eth_type_trans(skb_new, soft_iface); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN + hdr_size); netif_rx(skb_new); batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n"); ret = true; } else { /* Send the request to the DHT */ ret = batadv_dat_forward_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_GET); } out: batadv_dat_entry_put(dat_entry); return ret; } /** * batadv_dat_snoop_incoming_arp_request() - snoop the ARP request and try to * answer using the local DAT storage * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * @hdr_size: size of the encapsulation header * * Return: true if the request has been answered, false otherwise. */ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { u16 type; __be32 ip_src, ip_dst; u8 *hw_src; struct sk_buff *skb_new; struct batadv_dat_entry *dat_entry = NULL; bool ret = false; unsigned short vid; int err; if (!atomic_read(&bat_priv->distributed_arp_table)) goto out; vid = batadv_dat_get_vid(skb, &hdr_size); type = batadv_arp_get_type(bat_priv, skb, hdr_size); if (type != ARPOP_REQUEST) goto out; hw_src = batadv_arp_hw_src(skb, hdr_size); ip_src = batadv_arp_ip_src(skb, hdr_size); ip_dst = batadv_arp_ip_dst(skb, hdr_size); batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REQUEST"); batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid); dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid); if (!dat_entry) goto out; skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src, dat_entry->mac_addr, hw_src, vid); if (!skb_new) goto out; /* To preserve backwards compatibility, the node has choose the outgoing * format based on the incoming request packet type. The assumption is * that a node not using the 4addr packet format doesn't support it. */ if (hdr_size == sizeof(struct batadv_unicast_4addr_packet)) err = batadv_send_skb_via_tt_4addr(bat_priv, skb_new, BATADV_P_DAT_CACHE_REPLY, NULL, vid); else err = batadv_send_skb_via_tt(bat_priv, skb_new, NULL, vid); if (err != NET_XMIT_DROP) { batadv_inc_counter(bat_priv, BATADV_CNT_DAT_CACHED_REPLY_TX); ret = true; } out: batadv_dat_entry_put(dat_entry); if (ret) kfree_skb(skb); return ret; } /** * batadv_dat_snoop_outgoing_arp_reply() - snoop the ARP reply and fill the DHT * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check */ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb) { u16 type; __be32 ip_src, ip_dst; u8 *hw_src, *hw_dst; int hdr_size = 0; unsigned short vid; if (!atomic_read(&bat_priv->distributed_arp_table)) return; vid = batadv_dat_get_vid(skb, &hdr_size); type = batadv_arp_get_type(bat_priv, skb, hdr_size); if (type != ARPOP_REPLY) return; batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing outgoing ARP REPLY"); hw_src = batadv_arp_hw_src(skb, hdr_size); ip_src = batadv_arp_ip_src(skb, hdr_size); hw_dst = batadv_arp_hw_dst(skb, hdr_size); ip_dst = batadv_arp_ip_dst(skb, hdr_size); batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid); batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid); /* Send the ARP reply to the candidates for both the IP addresses that * the node obtained from the ARP reply */ batadv_dat_forward_data(bat_priv, skb, ip_src, vid, BATADV_P_DAT_DHT_PUT); batadv_dat_forward_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_PUT); } /** * batadv_dat_snoop_incoming_arp_reply() - snoop the ARP reply and fill the * local DAT storage only * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * @hdr_size: size of the encapsulation header * * Return: true if the packet was snooped and consumed by DAT. False if the * packet has to be delivered to the interface */ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { struct batadv_dat_entry *dat_entry = NULL; u16 type; __be32 ip_src, ip_dst; u8 *hw_src, *hw_dst; bool dropped = false; unsigned short vid; if (!atomic_read(&bat_priv->distributed_arp_table)) goto out; vid = batadv_dat_get_vid(skb, &hdr_size); type = batadv_arp_get_type(bat_priv, skb, hdr_size); if (type != ARPOP_REPLY) goto out; batadv_dbg_arp(bat_priv, skb, hdr_size, "Parsing incoming ARP REPLY"); hw_src = batadv_arp_hw_src(skb, hdr_size); ip_src = batadv_arp_ip_src(skb, hdr_size); hw_dst = batadv_arp_hw_dst(skb, hdr_size); ip_dst = batadv_arp_ip_dst(skb, hdr_size); /* If ip_dst is already in cache and has the right mac address, * drop this frame if this ARP reply is destined for us because it's * most probably an ARP reply generated by another node of the DHT. * We have most probably received already a reply earlier. Delivering * this frame would lead to doubled receive of an ARP reply. */ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_src, vid); if (dat_entry && batadv_compare_eth(hw_src, dat_entry->mac_addr)) { batadv_dbg(BATADV_DBG_DAT, bat_priv, "Doubled ARP reply removed: ARP MSG = [src: %pM-%pI4 dst: %pM-%pI4]; dat_entry: %pM-%pI4\n", hw_src, &ip_src, hw_dst, &ip_dst, dat_entry->mac_addr, &dat_entry->ip); dropped = true; } /* Update our internal cache with both the IP addresses the node got * within the ARP reply */ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid); batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid); if (dropped) goto out; /* If BLA is enabled, only forward ARP replies if we have claimed the * source of the ARP reply or if no one else of the same backbone has * already claimed that client. This prevents that different gateways * to the same backbone all forward the ARP reply leading to multiple * replies in the backbone. */ if (!batadv_bla_check_claim(bat_priv, hw_src, vid)) { batadv_dbg(BATADV_DBG_DAT, bat_priv, "Device %pM claimed by another backbone gw. Drop ARP reply.\n", hw_src); dropped = true; goto out; } /* if this REPLY is directed to a client of mine, let's deliver the * packet to the interface */ dropped = !batadv_is_my_client(bat_priv, hw_dst, vid); /* if this REPLY is sent on behalf of a client of mine, let's drop the * packet because the client will reply by itself */ dropped |= batadv_is_my_client(bat_priv, hw_src, vid); out: if (dropped) kfree_skb(skb); batadv_dat_entry_put(dat_entry); /* if dropped == false -> deliver to the interface */ return dropped; } /** * batadv_dat_check_dhcp_ipudp() - check skb for IP+UDP headers valid for DHCP * @skb: the packet to check * @ip_src: a buffer to store the IPv4 source address in * * Checks whether the given skb has an IP and UDP header valid for a DHCP * message from a DHCP server. And if so, stores the IPv4 source address in * the provided buffer. * * Return: True if valid, false otherwise. */ static bool batadv_dat_check_dhcp_ipudp(struct sk_buff *skb, __be32 *ip_src) { unsigned int offset = skb_network_offset(skb); struct udphdr *udphdr, _udphdr; struct iphdr *iphdr, _iphdr; iphdr = skb_header_pointer(skb, offset, sizeof(_iphdr), &_iphdr); if (!iphdr || iphdr->version != 4 || iphdr->ihl * 4 < sizeof(_iphdr)) return false; if (iphdr->protocol != IPPROTO_UDP) return false; offset += iphdr->ihl * 4; skb_set_transport_header(skb, offset); udphdr = skb_header_pointer(skb, offset, sizeof(_udphdr), &_udphdr); if (!udphdr || udphdr->source != htons(67)) return false; *ip_src = get_unaligned(&iphdr->saddr); return true; } /** * batadv_dat_check_dhcp() - examine packet for valid DHCP message * @skb: the packet to check * @proto: ethernet protocol hint (behind a potential vlan) * @ip_src: a buffer to store the IPv4 source address in * * Checks whether the given skb is a valid DHCP packet. And if so, stores the * IPv4 source address in the provided buffer. * * Caller needs to ensure that the skb network header is set correctly. * * Return: If skb is a valid DHCP packet, then returns its op code * (e.g. BOOTREPLY vs. BOOTREQUEST). Otherwise returns -EINVAL. */ static int batadv_dat_check_dhcp(struct sk_buff *skb, __be16 proto, __be32 *ip_src) { __be32 *magic, _magic; unsigned int offset; struct { __u8 op; __u8 htype; __u8 hlen; __u8 hops; } *dhcp_h, _dhcp_h; if (proto != htons(ETH_P_IP)) return -EINVAL; if (!batadv_dat_check_dhcp_ipudp(skb, ip_src)) return -EINVAL; offset = skb_transport_offset(skb) + sizeof(struct udphdr); if (skb->len < offset + sizeof(struct batadv_dhcp_packet)) return -EINVAL; dhcp_h = skb_header_pointer(skb, offset, sizeof(_dhcp_h), &_dhcp_h); if (!dhcp_h || dhcp_h->htype != BATADV_HTYPE_ETHERNET || dhcp_h->hlen != ETH_ALEN) return -EINVAL; offset += offsetof(struct batadv_dhcp_packet, magic); magic = skb_header_pointer(skb, offset, sizeof(_magic), &_magic); if (!magic || get_unaligned(magic) != htonl(BATADV_DHCP_MAGIC)) return -EINVAL; return dhcp_h->op; } /** * batadv_dat_get_dhcp_message_type() - get message type of a DHCP packet * @skb: the DHCP packet to parse * * Iterates over the DHCP options of the given DHCP packet to find a * DHCP Message Type option and parse it. * * Caller needs to ensure that the given skb is a valid DHCP packet and * that the skb transport header is set correctly. * * Return: The found DHCP message type value, if found. -EINVAL otherwise. */ static int batadv_dat_get_dhcp_message_type(struct sk_buff *skb) { unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr); u8 *type, _type; struct { u8 type; u8 len; } *tl, _tl; offset += sizeof(struct batadv_dhcp_packet); while ((tl = skb_header_pointer(skb, offset, sizeof(_tl), &_tl))) { if (tl->type == BATADV_DHCP_OPT_MSG_TYPE) break; if (tl->type == BATADV_DHCP_OPT_END) break; if (tl->type == BATADV_DHCP_OPT_PAD) offset++; else offset += tl->len + sizeof(_tl); } /* Option Overload Code not supported */ if (!tl || tl->type != BATADV_DHCP_OPT_MSG_TYPE || tl->len != sizeof(_type)) return -EINVAL; offset += sizeof(_tl); type = skb_header_pointer(skb, offset, sizeof(_type), &_type); if (!type) return -EINVAL; return *type; } /** * batadv_dat_dhcp_get_yiaddr() - get yiaddr from a DHCP packet * @skb: the DHCP packet to parse * @buf: a buffer to store the yiaddr in * * Caller needs to ensure that the given skb is a valid DHCP packet and * that the skb transport header is set correctly. * * Return: True on success, false otherwise. */ static bool batadv_dat_dhcp_get_yiaddr(struct sk_buff *skb, __be32 *buf) { unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr); __be32 *yiaddr; offset += offsetof(struct batadv_dhcp_packet, yiaddr); yiaddr = skb_header_pointer(skb, offset, BATADV_DHCP_YIADDR_LEN, buf); if (!yiaddr) return false; if (yiaddr != buf) *buf = get_unaligned(yiaddr); return true; } /** * batadv_dat_get_dhcp_chaddr() - get chaddr from a DHCP packet * @skb: the DHCP packet to parse * @buf: a buffer to store the chaddr in * * Caller needs to ensure that the given skb is a valid DHCP packet and * that the skb transport header is set correctly. * * Return: True on success, false otherwise */ static bool batadv_dat_get_dhcp_chaddr(struct sk_buff *skb, u8 *buf) { unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr); u8 *chaddr; offset += offsetof(struct batadv_dhcp_packet, chaddr); chaddr = skb_header_pointer(skb, offset, BATADV_DHCP_CHADDR_LEN, buf); if (!chaddr) return false; if (chaddr != buf) memcpy(buf, chaddr, BATADV_DHCP_CHADDR_LEN); return true; } /** * batadv_dat_put_dhcp() - puts addresses from a DHCP packet into the DHT and * DAT cache * @bat_priv: the bat priv with all the soft interface information * @chaddr: the DHCP client MAC address * @yiaddr: the DHCP client IP address * @hw_dst: the DHCP server MAC address * @ip_dst: the DHCP server IP address * @vid: VLAN identifier * * Adds given MAC/IP pairs to the local DAT cache and propagates them further * into the DHT. * * For the DHT propagation, client MAC + IP will appear as the ARP Reply * transmitter (and hw_dst/ip_dst as the target). */ static void batadv_dat_put_dhcp(struct batadv_priv *bat_priv, u8 *chaddr, __be32 yiaddr, u8 *hw_dst, __be32 ip_dst, unsigned short vid) { struct sk_buff *skb; skb = batadv_dat_arp_create_reply(bat_priv, yiaddr, ip_dst, chaddr, hw_dst, vid); if (!skb) return; skb_set_network_header(skb, ETH_HLEN); batadv_dat_entry_add(bat_priv, yiaddr, chaddr, vid); batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid); batadv_dat_forward_data(bat_priv, skb, yiaddr, vid, BATADV_P_DAT_DHT_PUT); batadv_dat_forward_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_PUT); consume_skb(skb); batadv_dbg(BATADV_DBG_DAT, bat_priv, "Snooped from outgoing DHCPACK (server address): %pI4, %pM (vid: %i)\n", &ip_dst, hw_dst, batadv_print_vid(vid)); batadv_dbg(BATADV_DBG_DAT, bat_priv, "Snooped from outgoing DHCPACK (client address): %pI4, %pM (vid: %i)\n", &yiaddr, chaddr, batadv_print_vid(vid)); } /** * batadv_dat_check_dhcp_ack() - examine packet for valid DHCP message * @skb: the packet to check * @proto: ethernet protocol hint (behind a potential vlan) * @ip_src: a buffer to store the IPv4 source address in * @chaddr: a buffer to store the DHCP Client Hardware Address in * @yiaddr: a buffer to store the DHCP Your IP Address in * * Checks whether the given skb is a valid DHCPACK. And if so, stores the * IPv4 server source address (ip_src), client MAC address (chaddr) and client * IPv4 address (yiaddr) in the provided buffers. * * Caller needs to ensure that the skb network header is set correctly. * * Return: True if the skb is a valid DHCPACK. False otherwise. */ static bool batadv_dat_check_dhcp_ack(struct sk_buff *skb, __be16 proto, __be32 *ip_src, u8 *chaddr, __be32 *yiaddr) { int type; type = batadv_dat_check_dhcp(skb, proto, ip_src); if (type != BATADV_BOOTREPLY) return false; type = batadv_dat_get_dhcp_message_type(skb); if (type != BATADV_DHCPACK) return false; if (!batadv_dat_dhcp_get_yiaddr(skb, yiaddr)) return false; if (!batadv_dat_get_dhcp_chaddr(skb, chaddr)) return false; return true; } /** * batadv_dat_snoop_outgoing_dhcp_ack() - snoop DHCPACK and fill DAT with it * @bat_priv: the bat priv with all the soft interface information * @skb: the packet to snoop * @proto: ethernet protocol hint (behind a potential vlan) * @vid: VLAN identifier * * This function first checks whether the given skb is a valid DHCPACK. If * so then its source MAC and IP as well as its DHCP Client Hardware Address * field and DHCP Your IP Address field are added to the local DAT cache and * propagated into the DHT. * * Caller needs to ensure that the skb mac and network headers are set * correctly. */ void batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv, struct sk_buff *skb, __be16 proto, unsigned short vid) { u8 chaddr[BATADV_DHCP_CHADDR_LEN]; __be32 ip_src, yiaddr; if (!atomic_read(&bat_priv->distributed_arp_table)) return; if (!batadv_dat_check_dhcp_ack(skb, proto, &ip_src, chaddr, &yiaddr)) return; batadv_dat_put_dhcp(bat_priv, chaddr, yiaddr, eth_hdr(skb)->h_source, ip_src, vid); } /** * batadv_dat_snoop_incoming_dhcp_ack() - snoop DHCPACK and fill DAT cache * @bat_priv: the bat priv with all the soft interface information * @skb: the packet to snoop * @hdr_size: header size, up to the tail of the batman-adv header * * This function first checks whether the given skb is a valid DHCPACK. If * so then its source MAC and IP as well as its DHCP Client Hardware Address * field and DHCP Your IP Address field are added to the local DAT cache. */ void batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { u8 chaddr[BATADV_DHCP_CHADDR_LEN]; struct ethhdr *ethhdr; __be32 ip_src, yiaddr; unsigned short vid; __be16 proto; u8 *hw_src; if (!atomic_read(&bat_priv->distributed_arp_table)) return; if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN))) return; ethhdr = (struct ethhdr *)(skb->data + hdr_size); skb_set_network_header(skb, hdr_size + ETH_HLEN); proto = ethhdr->h_proto; if (!batadv_dat_check_dhcp_ack(skb, proto, &ip_src, chaddr, &yiaddr)) return; hw_src = ethhdr->h_source; vid = batadv_dat_get_vid(skb, &hdr_size); batadv_dat_entry_add(bat_priv, yiaddr, chaddr, vid); batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid); batadv_dbg(BATADV_DBG_DAT, bat_priv, "Snooped from incoming DHCPACK (server address): %pI4, %pM (vid: %i)\n", &ip_src, hw_src, batadv_print_vid(vid)); batadv_dbg(BATADV_DBG_DAT, bat_priv, "Snooped from incoming DHCPACK (client address): %pI4, %pM (vid: %i)\n", &yiaddr, chaddr, batadv_print_vid(vid)); } /** * batadv_dat_drop_broadcast_packet() - check if an ARP request has to be * dropped (because the node has already obtained the reply via DAT) or not * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the broadcast packet * * Return: true if the node can drop the packet, false otherwise. */ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet) { u16 type; __be32 ip_dst; struct batadv_dat_entry *dat_entry = NULL; bool ret = false; int hdr_size = sizeof(struct batadv_bcast_packet); unsigned short vid; if (!atomic_read(&bat_priv->distributed_arp_table)) goto out; /* If this packet is an ARP_REQUEST and the node already has the * information that it is going to ask, then the packet can be dropped */ if (batadv_forw_packet_is_rebroadcast(forw_packet)) goto out; vid = batadv_dat_get_vid(forw_packet->skb, &hdr_size); type = batadv_arp_get_type(bat_priv, forw_packet->skb, hdr_size); if (type != ARPOP_REQUEST) goto out; ip_dst = batadv_arp_ip_dst(forw_packet->skb, hdr_size); dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid); /* check if the node already got this entry */ if (!dat_entry) { batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP Request for %pI4: fallback\n", &ip_dst); goto out; } batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP Request for %pI4: fallback prevented\n", &ip_dst); ret = true; out: batadv_dat_entry_put(dat_entry); return ret; } |
| 8 2 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2016, Amir Vadai <amir@vadai.me> * Copyright (c) 2016, Mellanox Technologies. All rights reserved. */ #ifndef __NET_TC_TUNNEL_KEY_H #define __NET_TC_TUNNEL_KEY_H #include <net/act_api.h> #include <linux/tc_act/tc_tunnel_key.h> #include <net/dst_metadata.h> struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; struct metadata_dst *tcft_enc_metadata; }; struct tcf_tunnel_key { struct tc_action common; struct tcf_tunnel_key_params __rcu *params; }; #define to_tunnel_key(a) ((struct tcf_tunnel_key *)a) static inline bool is_tcf_tunnel_set(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET; #endif return false; } static inline bool is_tcf_tunnel_release(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE; #endif return false; } static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); return ¶ms->tcft_enc_metadata->u.tun_info; #else return NULL; #endif } static inline struct ip_tunnel_info * tcf_tunnel_info_copy(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct ip_tunnel_info *tun = tcf_tunnel_info(a); if (tun) { size_t tun_size = sizeof(*tun) + tun->options_len; struct ip_tunnel_info *tun_copy = kmemdup(tun, tun_size, GFP_ATOMIC); return tun_copy; } #endif return NULL; } #endif /* __NET_TC_TUNNEL_KEY_H */ |
| 201 199 201 201 96 201 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | // SPDX-License-Identifier: GPL-2.0 #include <linux/log2.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include "darray.h" int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); /* * This is a workaround: kvmalloc() doesn't support > INT_MAX * allocations, but vmalloc() does. * The limit needs to be lifted from kvmalloc, and when it does * we'll go back to just using that. */ size_t bytes; if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) return -ENOMEM; void *data = likely(bytes < INT_MAX) ? kvmalloc_noprof(bytes, gfp) : vmalloc_noprof(bytes); if (!data) return -ENOMEM; if (d->size) memcpy(data, d->data, d->size * element_size); if (d->data != d->preallocated) kvfree(d->data); d->data = data; d->size = new_size; } return 0; } |
| 6 1 1 4 1 2 2 1 1 1 3 1 1 1 2 2 8 1 3 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 | // SPDX-License-Identifier: GPL-2.0-only /* * Vxlan vni filter for collect metadata mode * * Authors: Roopa Prabhu <roopa@nvidia.com> * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/etherdevice.h> #include <linux/rhashtable.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/vxlan.h> #include "vxlan_private.h" static inline int vxlan_vni_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct vxlan_vni_node *vnode = ptr; __be32 vni = *(__be32 *)arg->key; return vnode->vni != vni; } const struct rhashtable_params vxlan_vni_rht_params = { .head_offset = offsetof(struct vxlan_vni_node, vnode), .key_offset = offsetof(struct vxlan_vni_node, vni), .key_len = sizeof(__be32), .nelem_hint = 3, .max_size = VXLAN_N_VID, .obj_cmpfn = vxlan_vni_cmp, .automatic_shrinking = true, }; static void vxlan_vs_add_del_vninode(struct vxlan_dev *vxlan, struct vxlan_vni_node *v, bool del) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_dev_node *node; struct vxlan_sock *vs; spin_lock(&vn->sock_lock); if (del) { if (!hlist_unhashed(&v->hlist4.hlist)) hlist_del_init_rcu(&v->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) if (!hlist_unhashed(&v->hlist6.hlist)) hlist_del_init_rcu(&v->hlist6.hlist); #endif goto out; } #if IS_ENABLED(CONFIG_IPV6) vs = rtnl_dereference(vxlan->vn6_sock); if (vs && v) { node = &v->hlist6; hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); } #endif vs = rtnl_dereference(vxlan->vn4_sock); if (vs && v) { node = &v->hlist4; hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); } out: spin_unlock(&vn->sock_lock); } void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan, struct vxlan_sock *vs, bool ipv6) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); struct vxlan_vni_node *v, *tmp; struct vxlan_dev_node *node; if (!vg) return; spin_lock(&vn->sock_lock); list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { #if IS_ENABLED(CONFIG_IPV6) if (ipv6) node = &v->hlist6; else #endif node = &v->hlist4; node->vxlan = vxlan; hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); } spin_unlock(&vn->sock_lock); } void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan) { struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_vni_node *v, *tmp; if (!vg) return; spin_lock(&vn->sock_lock); list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { hlist_del_init_rcu(&v->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&v->hlist6.hlist); #endif } spin_unlock(&vn->sock_lock); } static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, struct vxlan_vni_stats *dest) { int i; memset(dest, 0, sizeof(*dest)); for_each_possible_cpu(i) { struct vxlan_vni_stats_pcpu *pstats; struct vxlan_vni_stats temp; unsigned int start; pstats = per_cpu_ptr(vninode->stats, i); do { start = u64_stats_fetch_begin(&pstats->syncp); memcpy(&temp, &pstats->stats, sizeof(temp)); } while (u64_stats_fetch_retry(&pstats->syncp, start)); dest->rx_packets += temp.rx_packets; dest->rx_bytes += temp.rx_bytes; dest->rx_drops += temp.rx_drops; dest->rx_errors += temp.rx_errors; dest->tx_packets += temp.tx_packets; dest->tx_bytes += temp.tx_bytes; dest->tx_drops += temp.tx_drops; dest->tx_errors += temp.tx_errors; } } static void vxlan_vnifilter_stats_add(struct vxlan_vni_node *vninode, int type, unsigned int len) { struct vxlan_vni_stats_pcpu *pstats = this_cpu_ptr(vninode->stats); u64_stats_update_begin(&pstats->syncp); switch (type) { case VXLAN_VNI_STATS_RX: pstats->stats.rx_bytes += len; pstats->stats.rx_packets++; break; case VXLAN_VNI_STATS_RX_DROPS: pstats->stats.rx_drops++; break; case VXLAN_VNI_STATS_RX_ERRORS: pstats->stats.rx_errors++; break; case VXLAN_VNI_STATS_TX: pstats->stats.tx_bytes += len; pstats->stats.tx_packets++; break; case VXLAN_VNI_STATS_TX_DROPS: pstats->stats.tx_drops++; break; case VXLAN_VNI_STATS_TX_ERRORS: pstats->stats.tx_errors++; break; } u64_stats_update_end(&pstats->syncp); } void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni, struct vxlan_vni_node *vninode, int type, unsigned int len) { struct vxlan_vni_node *vnode; if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return; if (vninode) { vnode = vninode; } else { vnode = vxlan_vnifilter_lookup(vxlan, vni); if (!vnode) return; } vxlan_vnifilter_stats_add(vnode, type, len); } static u32 vnirange(struct vxlan_vni_node *vbegin, struct vxlan_vni_node *vend) { return (be32_to_cpu(vend->vni) - be32_to_cpu(vbegin->vni)); } static size_t vxlan_vnifilter_entry_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct tunnel_msg)) + nla_total_size(0) /* VXLAN_VNIFILTER_ENTRY */ + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_START */ + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_END */ + nla_total_size(sizeof(struct in6_addr));/* VXLAN_VNIFILTER_ENTRY_GROUP{6} */ } static int __vnifilter_entry_fill_stats(struct sk_buff *skb, const struct vxlan_vni_node *vbegin) { struct vxlan_vni_stats vstats; struct nlattr *vstats_attr; vstats_attr = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY_STATS); if (!vstats_attr) goto out_stats_err; vxlan_vnifilter_stats_get(vbegin, &vstats); if (nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_BYTES, vstats.rx_bytes, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_PKTS, vstats.rx_packets, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_DROPS, vstats.rx_drops, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_ERRORS, vstats.rx_errors, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_BYTES, vstats.tx_bytes, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_PKTS, vstats.tx_packets, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_DROPS, vstats.tx_drops, VNIFILTER_ENTRY_STATS_PAD) || nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_ERRORS, vstats.tx_errors, VNIFILTER_ENTRY_STATS_PAD)) goto out_stats_err; nla_nest_end(skb, vstats_attr); return 0; out_stats_err: nla_nest_cancel(skb, vstats_attr); return -EMSGSIZE; } static bool vxlan_fill_vni_filter_entry(struct sk_buff *skb, struct vxlan_vni_node *vbegin, struct vxlan_vni_node *vend, bool fill_stats) { struct nlattr *ventry; u32 vs = be32_to_cpu(vbegin->vni); u32 ve = 0; if (vbegin != vend) ve = be32_to_cpu(vend->vni); ventry = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY); if (!ventry) return false; if (nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_START, vs)) goto out_err; if (ve && nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_END, ve)) goto out_err; if (!vxlan_addr_any(&vbegin->remote_ip)) { if (vbegin->remote_ip.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP, vbegin->remote_ip.sin.sin_addr.s_addr)) goto out_err; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP6, &vbegin->remote_ip.sin6.sin6_addr)) goto out_err; #endif } } if (fill_stats && __vnifilter_entry_fill_stats(skb, vbegin)) goto out_err; nla_nest_end(skb, ventry); return true; out_err: nla_nest_cancel(skb, ventry); return false; } static void vxlan_vnifilter_notify(const struct vxlan_dev *vxlan, struct vxlan_vni_node *vninode, int cmd) { struct tunnel_msg *tmsg; struct sk_buff *skb; struct nlmsghdr *nlh; struct net *net = dev_net(vxlan->dev); int err = -ENOBUFS; skb = nlmsg_new(vxlan_vnifilter_entry_nlmsg_size(), GFP_KERNEL); if (!skb) goto out_err; err = -EMSGSIZE; nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*tmsg), 0); if (!nlh) goto out_err; tmsg = nlmsg_data(nlh); memset(tmsg, 0, sizeof(*tmsg)); tmsg->family = AF_BRIDGE; tmsg->ifindex = vxlan->dev->ifindex; if (!vxlan_fill_vni_filter_entry(skb, vninode, vninode, false)) goto out_err; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_TUNNEL, NULL, GFP_KERNEL); return; out_err: rtnl_set_sk_err(net, RTNLGRP_TUNNEL, err); kfree_skb(skb); } static int vxlan_vnifilter_dump_dev(const struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct vxlan_vni_node *tmp, *v, *vbegin = NULL, *vend = NULL; struct vxlan_dev *vxlan = netdev_priv(dev); struct tunnel_msg *new_tmsg, *tmsg; int idx = 0, s_idx = cb->args[1]; struct vxlan_vni_group *vg; struct nlmsghdr *nlh; bool dump_stats; int err = 0; if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return -EINVAL; /* RCU needed because of the vni locking rules (rcu || rtnl) */ vg = rcu_dereference(vxlan->vnigrp); if (!vg || !vg->num_vnis) return 0; tmsg = nlmsg_data(cb->nlh); dump_stats = !!(tmsg->flags & TUNNEL_MSG_FLAG_STATS); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWTUNNEL, sizeof(*new_tmsg), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; new_tmsg = nlmsg_data(nlh); memset(new_tmsg, 0, sizeof(*new_tmsg)); new_tmsg->family = PF_BRIDGE; new_tmsg->ifindex = dev->ifindex; list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { if (idx < s_idx) { idx++; continue; } if (!vbegin) { vbegin = v; vend = v; continue; } if (!dump_stats && vnirange(vend, v) == 1 && vxlan_addr_equal(&v->remote_ip, &vend->remote_ip)) { goto update_end; } else { if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, dump_stats)) { err = -EMSGSIZE; break; } idx += vnirange(vbegin, vend) + 1; vbegin = v; } update_end: vend = v; } if (!err && vbegin) { if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, dump_stats)) err = -EMSGSIZE; } cb->args[1] = err ? idx : 0; nlmsg_end(skb, nlh); return err; } static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, err = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); struct tunnel_msg *tmsg; struct net_device *dev; if (cb->nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct tunnel_msg))) { NL_SET_ERR_MSG(cb->extack, "Invalid msg length"); return -EINVAL; } tmsg = nlmsg_data(cb->nlh); if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { NL_SET_ERR_MSG(cb->extack, "Invalid tunnelmsg flags in ancillary header"); return -EINVAL; } rcu_read_lock(); if (tmsg->ifindex) { dev = dev_get_by_index_rcu(net, tmsg->ifindex); if (!dev) { err = -ENODEV; goto out_err; } if (!netif_is_vxlan(dev)) { NL_SET_ERR_MSG(cb->extack, "The device is not a vxlan device"); err = -EINVAL; goto out_err; } err = vxlan_vnifilter_dump_dev(dev, skb, cb); /* if the dump completed without an error we return 0 here */ if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { if (!netif_is_vxlan(dev)) continue; if (idx < s_idx) goto skip; err = vxlan_vnifilter_dump_dev(dev, skb, cb); if (err == -EMSGSIZE) break; skip: idx++; } } cb->args[0] = idx; rcu_read_unlock(); return skb->len; out_err: rcu_read_unlock(); return err; } static const struct nla_policy vni_filter_entry_policy[VXLAN_VNIFILTER_ENTRY_MAX + 1] = { [VXLAN_VNIFILTER_ENTRY_START] = { .type = NLA_U32 }, [VXLAN_VNIFILTER_ENTRY_END] = { .type = NLA_U32 }, [VXLAN_VNIFILTER_ENTRY_GROUP] = { .type = NLA_BINARY, .len = sizeof_field(struct iphdr, daddr) }, [VXLAN_VNIFILTER_ENTRY_GROUP6] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, }; static const struct nla_policy vni_filter_policy[VXLAN_VNIFILTER_MAX + 1] = { [VXLAN_VNIFILTER_ENTRY] = { .type = NLA_NESTED }, }; static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, union vxlan_addr *old_remote_ip, union vxlan_addr *remote_ip, struct netlink_ext_ack *extack) { struct vxlan_rdst *dst = &vxlan->default_dst; u32 hash_index; int err = 0; hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); if (remote_ip && !vxlan_addr_any(remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, remote_ip, NUD_REACHABLE | NUD_PERMANENT, NLM_F_APPEND | NLM_F_CREATE, vxlan->cfg.dst_port, vni, vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } } if (old_remote_ip && !vxlan_addr_any(old_remote_ip)) { __vxlan_fdb_delete(vxlan, all_zeros_mac, *old_remote_ip, vxlan->cfg.dst_port, vni, vni, dst->remote_ifindex, true); } spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } static int vxlan_vni_update_group(struct vxlan_dev *vxlan, struct vxlan_vni_node *vninode, union vxlan_addr *group, bool create, bool *changed, struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_rdst *dst = &vxlan->default_dst; union vxlan_addr *newrip = NULL, *oldrip = NULL; union vxlan_addr old_remote_ip; int ret = 0; memcpy(&old_remote_ip, &vninode->remote_ip, sizeof(old_remote_ip)); /* if per vni remote ip is not present use vxlan dev * default dst remote ip for fdb entry */ if (group && !vxlan_addr_any(group)) { newrip = group; } else { if (!vxlan_addr_any(&dst->remote_ip)) newrip = &dst->remote_ip; } /* if old rip exists, and no newrip, * explicitly delete old rip */ if (!newrip && !vxlan_addr_any(&old_remote_ip)) oldrip = &old_remote_ip; if (!newrip && !oldrip) return 0; if (!create && oldrip && newrip && vxlan_addr_equal(oldrip, newrip)) return 0; ret = vxlan_update_default_fdb_entry(vxlan, vninode->vni, oldrip, newrip, extack); if (ret) goto out; if (group) memcpy(&vninode->remote_ip, group, sizeof(vninode->remote_ip)); if (vxlan->dev->flags & IFF_UP) { if (vxlan_addr_multicast(&old_remote_ip) && !vxlan_group_used(vn, vxlan, vninode->vni, &old_remote_ip, vxlan->default_dst.remote_ifindex)) { ret = vxlan_igmp_leave(vxlan, &old_remote_ip, 0); if (ret) goto out; } if (vxlan_addr_multicast(&vninode->remote_ip)) { ret = vxlan_igmp_join(vxlan, &vninode->remote_ip, 0); if (ret == -EADDRINUSE) ret = 0; if (ret) goto out; } } *changed = true; return 0; out: return ret; } int vxlan_vnilist_update_group(struct vxlan_dev *vxlan, union vxlan_addr *old_remote_ip, union vxlan_addr *new_remote_ip, struct netlink_ext_ack *extack) { struct list_head *headp, *hpos; struct vxlan_vni_group *vg; struct vxlan_vni_node *vent; int ret; vg = rtnl_dereference(vxlan->vnigrp); headp = &vg->vni_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct vxlan_vni_node, vlist); if (vxlan_addr_any(&vent->remote_ip)) { ret = vxlan_update_default_fdb_entry(vxlan, vent->vni, old_remote_ip, new_remote_ip, extack); if (ret) return ret; } } return 0; } static void vxlan_vni_delete_group(struct vxlan_dev *vxlan, struct vxlan_vni_node *vninode) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_rdst *dst = &vxlan->default_dst; /* if per vni remote_ip not present, delete the * default dst remote_ip previously added for this vni */ if (!vxlan_addr_any(&vninode->remote_ip) || !vxlan_addr_any(&dst->remote_ip)) __vxlan_fdb_delete(vxlan, all_zeros_mac, (vxlan_addr_any(&vninode->remote_ip) ? dst->remote_ip : vninode->remote_ip), vxlan->cfg.dst_port, vninode->vni, vninode->vni, dst->remote_ifindex, true); if (vxlan->dev->flags & IFF_UP) { if (vxlan_addr_multicast(&vninode->remote_ip) && !vxlan_group_used(vn, vxlan, vninode->vni, &vninode->remote_ip, dst->remote_ifindex)) { vxlan_igmp_leave(vxlan, &vninode->remote_ip, 0); } } } static int vxlan_vni_update(struct vxlan_dev *vxlan, struct vxlan_vni_group *vg, __be32 vni, union vxlan_addr *group, bool *changed, struct netlink_ext_ack *extack) { struct vxlan_vni_node *vninode; int ret; vninode = rhashtable_lookup_fast(&vg->vni_hash, &vni, vxlan_vni_rht_params); if (!vninode) return 0; ret = vxlan_vni_update_group(vxlan, vninode, group, false, changed, extack); if (ret) return ret; if (changed) vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL); return 0; } static void __vxlan_vni_add_list(struct vxlan_vni_group *vg, struct vxlan_vni_node *v) { struct list_head *headp, *hpos; struct vxlan_vni_node *vent; headp = &vg->vni_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct vxlan_vni_node, vlist); if (be32_to_cpu(v->vni) < be32_to_cpu(vent->vni)) continue; else break; } list_add_rcu(&v->vlist, hpos); vg->num_vnis++; } static void __vxlan_vni_del_list(struct vxlan_vni_group *vg, struct vxlan_vni_node *v) { list_del_rcu(&v->vlist); vg->num_vnis--; } static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_vni_node *vninode; vninode = kzalloc(sizeof(*vninode), GFP_KERNEL); if (!vninode) return NULL; vninode->stats = netdev_alloc_pcpu_stats(struct vxlan_vni_stats_pcpu); if (!vninode->stats) { kfree(vninode); return NULL; } vninode->vni = vni; vninode->hlist4.vxlan = vxlan; #if IS_ENABLED(CONFIG_IPV6) vninode->hlist6.vxlan = vxlan; #endif return vninode; } static void vxlan_vni_free(struct vxlan_vni_node *vninode) { free_percpu(vninode->stats); kfree(vninode); } static int vxlan_vni_add(struct vxlan_dev *vxlan, struct vxlan_vni_group *vg, u32 vni, union vxlan_addr *group, struct netlink_ext_ack *extack) { struct vxlan_vni_node *vninode; __be32 v = cpu_to_be32(vni); bool changed = false; int err = 0; if (vxlan_vnifilter_lookup(vxlan, v)) return vxlan_vni_update(vxlan, vg, v, group, &changed, extack); err = vxlan_vni_in_use(vxlan->net, vxlan, &vxlan->cfg, v); if (err) { NL_SET_ERR_MSG(extack, "VNI in use"); return err; } vninode = vxlan_vni_alloc(vxlan, v); if (!vninode) return -ENOMEM; err = rhashtable_lookup_insert_fast(&vg->vni_hash, &vninode->vnode, vxlan_vni_rht_params); if (err) { vxlan_vni_free(vninode); return err; } __vxlan_vni_add_list(vg, vninode); if (vxlan->dev->flags & IFF_UP) vxlan_vs_add_del_vninode(vxlan, vninode, false); err = vxlan_vni_update_group(vxlan, vninode, group, true, &changed, extack); if (changed) vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL); return err; } static void vxlan_vni_node_rcu_free(struct rcu_head *rcu) { struct vxlan_vni_node *v; v = container_of(rcu, struct vxlan_vni_node, rcu); vxlan_vni_free(v); } static int vxlan_vni_del(struct vxlan_dev *vxlan, struct vxlan_vni_group *vg, u32 vni, struct netlink_ext_ack *extack) { struct vxlan_vni_node *vninode; __be32 v = cpu_to_be32(vni); int err = 0; vg = rtnl_dereference(vxlan->vnigrp); vninode = rhashtable_lookup_fast(&vg->vni_hash, &v, vxlan_vni_rht_params); if (!vninode) { err = -ENOENT; goto out; } vxlan_vni_delete_group(vxlan, vninode); err = rhashtable_remove_fast(&vg->vni_hash, &vninode->vnode, vxlan_vni_rht_params); if (err) goto out; __vxlan_vni_del_list(vg, vninode); vxlan_vnifilter_notify(vxlan, vninode, RTM_DELTUNNEL); if (vxlan->dev->flags & IFF_UP) vxlan_vs_add_del_vninode(vxlan, vninode, true); call_rcu(&vninode->rcu, vxlan_vni_node_rcu_free); return 0; out: return err; } static int vxlan_vni_add_del(struct vxlan_dev *vxlan, __u32 start_vni, __u32 end_vni, union vxlan_addr *group, int cmd, struct netlink_ext_ack *extack) { struct vxlan_vni_group *vg; int v, err = 0; vg = rtnl_dereference(vxlan->vnigrp); for (v = start_vni; v <= end_vni; v++) { switch (cmd) { case RTM_NEWTUNNEL: err = vxlan_vni_add(vxlan, vg, v, group, extack); break; case RTM_DELTUNNEL: err = vxlan_vni_del(vxlan, vg, v, extack); break; default: err = -EOPNOTSUPP; break; } if (err) goto out; } return 0; out: return err; } static int vxlan_process_vni_filter(struct vxlan_dev *vxlan, struct nlattr *nlvnifilter, int cmd, struct netlink_ext_ack *extack) { struct nlattr *vattrs[VXLAN_VNIFILTER_ENTRY_MAX + 1]; u32 vni_start = 0, vni_end = 0; union vxlan_addr group; int err; err = nla_parse_nested(vattrs, VXLAN_VNIFILTER_ENTRY_MAX, nlvnifilter, vni_filter_entry_policy, extack); if (err) return err; if (vattrs[VXLAN_VNIFILTER_ENTRY_START]) { vni_start = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_START]); vni_end = vni_start; } if (vattrs[VXLAN_VNIFILTER_ENTRY_END]) vni_end = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_END]); if (!vni_start && !vni_end) { NL_SET_ERR_MSG_ATTR(extack, nlvnifilter, "vni start nor end found in vni entry"); return -EINVAL; } if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]) { group.sin.sin_addr.s_addr = nla_get_in_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]); group.sa.sa_family = AF_INET; } else if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]) { group.sin6.sin6_addr = nla_get_in6_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]); group.sa.sa_family = AF_INET6; } else { memset(&group, 0, sizeof(group)); } if (vxlan_addr_multicast(&group) && !vxlan->default_dst.remote_ifindex) { NL_SET_ERR_MSG(extack, "Local interface required for multicast remote group"); return -EINVAL; } err = vxlan_vni_add_del(vxlan, vni_start, vni_end, &group, cmd, extack); if (err) return err; return 0; } void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan) { struct vxlan_vni_node *v, *tmp; struct vxlan_vni_group *vg; vg = rtnl_dereference(vxlan->vnigrp); list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { rhashtable_remove_fast(&vg->vni_hash, &v->vnode, vxlan_vni_rht_params); hlist_del_init_rcu(&v->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&v->hlist6.hlist); #endif __vxlan_vni_del_list(vg, v); vxlan_vnifilter_notify(vxlan, v, RTM_DELTUNNEL); call_rcu(&v->rcu, vxlan_vni_node_rcu_free); } rhashtable_destroy(&vg->vni_hash); kfree(vg); } int vxlan_vnigroup_init(struct vxlan_dev *vxlan) { struct vxlan_vni_group *vg; int ret; vg = kzalloc(sizeof(*vg), GFP_KERNEL); if (!vg) return -ENOMEM; ret = rhashtable_init(&vg->vni_hash, &vxlan_vni_rht_params); if (ret) { kfree(vg); return ret; } INIT_LIST_HEAD(&vg->vni_list); rcu_assign_pointer(vxlan->vnigrp, vg); return 0; } static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tunnel_msg *tmsg; struct vxlan_dev *vxlan; struct net_device *dev; struct nlattr *attr; int err, vnis = 0; int rem; /* this should validate the header and check for remaining bytes */ err = nlmsg_parse(nlh, sizeof(*tmsg), NULL, VXLAN_VNIFILTER_MAX, vni_filter_policy, extack); if (err < 0) return err; tmsg = nlmsg_data(nlh); dev = __dev_get_by_index(net, tmsg->ifindex); if (!dev) return -ENODEV; if (!netif_is_vxlan(dev)) { NL_SET_ERR_MSG_MOD(extack, "The device is not a vxlan device"); return -EINVAL; } vxlan = netdev_priv(dev); if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return -EOPNOTSUPP; nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) { switch (nla_type(attr)) { case VXLAN_VNIFILTER_ENTRY: err = vxlan_process_vni_filter(vxlan, attr, nlh->nlmsg_type, extack); break; default: continue; } vnis++; if (err) break; } if (!vnis) { NL_SET_ERR_MSG_MOD(extack, "No vnis found to process"); err = -EINVAL; } return err; } static const struct rtnl_msg_handler vxlan_vnifilter_rtnl_msg_handlers[] = { {THIS_MODULE, PF_BRIDGE, RTM_GETTUNNEL, NULL, vxlan_vnifilter_dump, 0}, {THIS_MODULE, PF_BRIDGE, RTM_NEWTUNNEL, vxlan_vnifilter_process, NULL, 0}, {THIS_MODULE, PF_BRIDGE, RTM_DELTUNNEL, vxlan_vnifilter_process, NULL, 0}, }; int vxlan_vnifilter_init(void) { return rtnl_register_many(vxlan_vnifilter_rtnl_msg_handlers); } void vxlan_vnifilter_uninit(void) { rtnl_unregister_many(vxlan_vnifilter_rtnl_msg_handlers); } |
| 202 201 202 197 196 6 6 271 6 83 197 175 105 277 220 220 278 278 1 220 220 45 284 283 45 1 283 283 283 12 58 58 39 39 29 29 15 15 254 255 114 3 23 18 18 8 8 283 284 1 181 74 30 249 6 3 1 4 174 21 52 225 226 225 225 44 226 26 70 18 147 9 10 8 208 7 23 111 1 91 48 200 200 201 201 82 198 6 197 196 196 196 113 85 197 8 1 7 8 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 | // SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter driver * * Copyright (C) 2009 - 2013 Johan Hovold (jhovold@gmail.com) * Copyright (C) 1999 - 2012 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2000 Peter Berger (pberger@brimson.com) * Copyright (C) 2000 Al Borchers (borchers@steinerpoint.com) * * This driver was originally based on the ACM driver by Armin Fuerst (which was * based on a driver by Brad Keryan) * * See Documentation/usb/usb-serial.rst for more information on using this * driver */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/kfifo.h> #include <linux/idr.h> #define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@linuxfoundation.org>" #define DRIVER_DESC "USB Serial Driver core" #define USB_SERIAL_TTY_MAJOR 188 #define USB_SERIAL_TTY_MINORS 512 /* should be enough for a while */ /* There is no MODULE_DEVICE_TABLE for usbserial.c. Instead the MODULE_DEVICE_TABLE declarations in each serial driver cause the "hotplug" program to pull in whatever module is necessary via modprobe, and modprobe will load usbserial because the serial drivers depend on it. */ static DEFINE_IDR(serial_minors); static DEFINE_MUTEX(table_lock); static LIST_HEAD(usb_serial_driver_list); /* * Look up the serial port structure. If it is found and it hasn't been * disconnected, return with the parent usb_serial structure's disc_mutex held * and its refcount incremented. Otherwise return NULL. */ struct usb_serial_port *usb_serial_port_get_by_minor(unsigned minor) { struct usb_serial *serial; struct usb_serial_port *port; mutex_lock(&table_lock); port = idr_find(&serial_minors, minor); if (!port) goto exit; serial = port->serial; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { mutex_unlock(&serial->disc_mutex); port = NULL; } else { kref_get(&serial->kref); } exit: mutex_unlock(&table_lock); return port; } static int allocate_minors(struct usb_serial *serial, int num_ports) { struct usb_serial_port *port; unsigned int i, j; int minor; dev_dbg(&serial->interface->dev, "%s %d\n", __func__, num_ports); mutex_lock(&table_lock); for (i = 0; i < num_ports; ++i) { port = serial->port[i]; minor = idr_alloc(&serial_minors, port, 0, USB_SERIAL_TTY_MINORS, GFP_KERNEL); if (minor < 0) goto error; port->minor = minor; port->port_number = i; } serial->minors_reserved = 1; mutex_unlock(&table_lock); return 0; error: /* unwind the already allocated minors */ for (j = 0; j < i; ++j) idr_remove(&serial_minors, serial->port[j]->minor); mutex_unlock(&table_lock); return minor; } static void release_minors(struct usb_serial *serial) { int i; mutex_lock(&table_lock); for (i = 0; i < serial->num_ports; ++i) idr_remove(&serial_minors, serial->port[i]->minor); mutex_unlock(&table_lock); serial->minors_reserved = 0; } int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; int ret; if (serial->sibling) return -EBUSY; ret = usb_driver_claim_interface(driver, intf, serial); if (ret) { dev_err(&serial->interface->dev, "failed to claim sibling interface: %d\n", ret); return ret; } serial->sibling = intf; return 0; } EXPORT_SYMBOL_GPL(usb_serial_claim_interface); static void release_sibling(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; struct usb_interface *sibling; if (!serial->sibling) return; if (intf == serial->sibling) sibling = serial->interface; else sibling = serial->sibling; usb_set_intfdata(sibling, NULL); usb_driver_release_interface(driver, sibling); } static void destroy_serial(struct kref *kref) { struct usb_serial *serial; struct usb_serial_port *port; int i; serial = to_usb_serial(kref); /* return the minor range that this device had */ if (serial->minors_reserved) release_minors(serial); if (serial->attached && serial->type->release) serial->type->release(serial); /* Now that nothing is using the ports, they can be freed */ for (i = 0; i < serial->num_port_pointers; ++i) { port = serial->port[i]; if (port) { port->serial = NULL; put_device(&port->dev); } } usb_put_intf(serial->interface); usb_put_dev(serial->dev); kfree(serial); } void usb_serial_put(struct usb_serial *serial) { kref_put(&serial->kref, destroy_serial); } /***************************************************************************** * Driver tty interface functions *****************************************************************************/ /** * serial_install - install tty * @driver: the driver (USB in our case) * @tty: the tty being created * * Initialise the termios structure for this tty. We use the default * USB serial settings but permit them to be overridden by * serial->type->init_termios on first open. * * This is the first place a new tty gets used. Hence this is where we * acquire references to the usb_serial structure and the driver module, * where we store a pointer to the port. All these actions are reversed * in serial_cleanup(). */ static int serial_install(struct tty_driver *driver, struct tty_struct *tty) { int idx = tty->index; struct usb_serial *serial; struct usb_serial_port *port; bool init_termios; int retval = -ENODEV; port = usb_serial_port_get_by_minor(idx); if (!port) return retval; serial = port->serial; if (!try_module_get(serial->type->driver.owner)) goto err_put_serial; init_termios = (driver->termios[idx] == NULL); retval = tty_standard_install(driver, tty); if (retval) goto err_put_module; mutex_unlock(&serial->disc_mutex); /* allow the driver to update the initial settings */ if (init_termios && serial->type->init_termios) serial->type->init_termios(tty); tty->driver_data = port; return retval; err_put_module: module_put(serial->type->driver.owner); err_put_serial: usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); return retval; } static int serial_port_activate(struct tty_port *tport, struct tty_struct *tty) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial *serial = port->serial; int retval; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { retval = -ENODEV; goto out_unlock; } retval = usb_autopm_get_interface(serial->interface); if (retval) goto out_unlock; retval = port->serial->type->open(tty, port); if (retval) usb_autopm_put_interface(serial->interface); out_unlock: mutex_unlock(&serial->disc_mutex); if (retval < 0) retval = usb_translate_errors(retval); return retval; } static int serial_open(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return tty_port_open(&port->port, tty, filp); } /** * serial_port_shutdown - shut down hardware * @tport: tty port to shut down * * Shut down a USB serial port. Serialized against activate by the * tport mutex and kept to matching open/close pairs * of calls by the tty-port initialized flag. * * Not called if tty is console. */ static void serial_port_shutdown(struct tty_port *tport) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial_driver *drv = port->serial->type; if (drv->close) drv->close(port); usb_autopm_put_interface(port->serial->interface); } static void serial_hangup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_hangup(&port->port); } static void serial_close(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_close(&port->port, tty, filp); } /** * serial_cleanup - free resources post close/hangup * @tty: tty to clean up * * Do the resource freeing and refcount dropping for the port. * Avoid freeing the console. * * Called asynchronously after the last tty kref is dropped. */ static void serial_cleanup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial; struct module *owner; dev_dbg(&port->dev, "%s\n", __func__); /* The console is magical. Do not hang up the console hardware * or there will be tears. */ if (port->port.console) return; tty->driver_data = NULL; serial = port->serial; owner = serial->type->driver.owner; usb_serial_put(serial); module_put(owner); } static ssize_t serial_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct usb_serial_port *port = tty->driver_data; int retval = -ENODEV; if (port->serial->dev->state == USB_STATE_NOTATTACHED) goto exit; dev_dbg(&port->dev, "%s - %zu byte(s)\n", __func__, count); retval = port->serial->type->write(tty, port, buf, count); if (retval < 0) retval = usb_translate_errors(retval); exit: return retval; } static unsigned int serial_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return port->serial->type->write_room(tty); } static unsigned int serial_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (serial->disconnected) return 0; return serial->type->chars_in_buffer(tty); } static void serial_wait_until_sent(struct tty_struct *tty, int timeout) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (!port->serial->type->wait_until_sent) return; mutex_lock(&serial->disc_mutex); if (!serial->disconnected) port->serial->type->wait_until_sent(tty, timeout); mutex_unlock(&serial->disc_mutex); } static void serial_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->throttle) port->serial->type->throttle(tty); } static void serial_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->unthrottle) port->serial->type->unthrottle(tty); } static int serial_get_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; mutex_lock(&tport->mutex); close_delay = jiffies_to_msecs(tport->close_delay) / 10; closing_wait = tport->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = jiffies_to_msecs(closing_wait) / 10; ss->line = port->minor; ss->close_delay = close_delay; ss->closing_wait = closing_wait; if (port->serial->type->get_serial) port->serial->type->get_serial(tty, ss); mutex_unlock(&tport->mutex); return 0; } static int serial_set_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; int ret = 0; close_delay = msecs_to_jiffies(ss->close_delay * 10); closing_wait = ss->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = msecs_to_jiffies(closing_wait * 10); mutex_lock(&tport->mutex); if (!capable(CAP_SYS_ADMIN)) { if (close_delay != tport->close_delay || closing_wait != tport->closing_wait) { ret = -EPERM; goto out_unlock; } } if (port->serial->type->set_serial) { ret = port->serial->type->set_serial(tty, ss); if (ret) goto out_unlock; } tport->close_delay = close_delay; tport->closing_wait = closing_wait; out_unlock: mutex_unlock(&tport->mutex); return ret; } static int serial_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct usb_serial_port *port = tty->driver_data; int retval = -ENOIOCTLCMD; dev_dbg(&port->dev, "%s - cmd 0x%04x\n", __func__, cmd); switch (cmd) { case TIOCMIWAIT: if (port->serial->type->tiocmiwait) retval = port->serial->type->tiocmiwait(tty, arg); break; default: if (port->serial->type->ioctl) retval = port->serial->type->ioctl(tty, cmd, arg); } return retval; } static void serial_set_termios(struct tty_struct *tty, const struct ktermios *old) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->set_termios) port->serial->type->set_termios(tty, port, old); else tty_termios_copy_hw(&tty->termios, old); } static int serial_break(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->break_ctl) return port->serial->type->break_ctl(tty, break_state); return -ENOTTY; } static int serial_proc_show(struct seq_file *m, void *v) { struct usb_serial *serial; struct usb_serial_port *port; int i; char tmp[40]; seq_puts(m, "usbserinfo:1.0 driver:2.0\n"); for (i = 0; i < USB_SERIAL_TTY_MINORS; ++i) { port = usb_serial_port_get_by_minor(i); if (port == NULL) continue; serial = port->serial; seq_printf(m, "%d:", i); if (serial->type->driver.owner) seq_printf(m, " module:%s", module_name(serial->type->driver.owner)); seq_printf(m, " name:\"%s\"", serial->type->description); seq_printf(m, " vendor:%04x product:%04x", le16_to_cpu(serial->dev->descriptor.idVendor), le16_to_cpu(serial->dev->descriptor.idProduct)); seq_printf(m, " num_ports:%d", serial->num_ports); seq_printf(m, " port:%d", port->port_number); usb_make_path(serial->dev, tmp, sizeof(tmp)); seq_printf(m, " path:%s", tmp); seq_putc(m, '\n'); usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); } return 0; } static int serial_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmget) return port->serial->type->tiocmget(tty); return -ENOTTY; } static int serial_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmset) return port->serial->type->tiocmset(tty, set, clear); return -ENOTTY; } static int serial_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->get_icount) return port->serial->type->get_icount(tty, icount); return -ENOTTY; } /* * We would be calling tty_wakeup here, but unfortunately some line * disciplines have an annoying habit of calling tty->write from * the write wakeup callback (e.g. n_hdlc.c). */ void usb_serial_port_softint(struct usb_serial_port *port) { schedule_work(&port->work); } EXPORT_SYMBOL_GPL(usb_serial_port_softint); static void usb_serial_port_work(struct work_struct *work) { struct usb_serial_port *port = container_of(work, struct usb_serial_port, work); tty_port_tty_wakeup(&port->port); } static void usb_serial_port_poison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_poison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_poison_urb(port->write_urbs[i]); usb_poison_urb(port->interrupt_in_urb); usb_poison_urb(port->interrupt_out_urb); } static void usb_serial_port_unpoison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_unpoison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_unpoison_urb(port->write_urbs[i]); usb_unpoison_urb(port->interrupt_in_urb); usb_unpoison_urb(port->interrupt_out_urb); } static void usb_serial_port_release(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); int i; dev_dbg(dev, "%s\n", __func__); usb_free_urb(port->interrupt_in_urb); usb_free_urb(port->interrupt_out_urb); for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { usb_free_urb(port->read_urbs[i]); kfree(port->bulk_in_buffers[i]); } for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { usb_free_urb(port->write_urbs[i]); kfree(port->bulk_out_buffers[i]); } kfifo_free(&port->write_fifo); kfree(port->interrupt_in_buffer); kfree(port->interrupt_out_buffer); tty_port_destroy(&port->port); kfree(port); } static struct usb_serial *create_serial(struct usb_device *dev, struct usb_interface *interface, struct usb_serial_driver *driver) { struct usb_serial *serial; serial = kzalloc(sizeof(*serial), GFP_KERNEL); if (!serial) return NULL; serial->dev = usb_get_dev(dev); serial->type = driver; serial->interface = usb_get_intf(interface); kref_init(&serial->kref); mutex_init(&serial->disc_mutex); serial->minors_reserved = 0; return serial; } static const struct usb_device_id *match_dynamic_id(struct usb_interface *intf, struct usb_serial_driver *drv) { struct usb_dynid *dynid; guard(mutex)(&usb_dynids_lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (usb_match_one_id(intf, &dynid->id)) { return &dynid->id; } } return NULL; } static const struct usb_device_id *get_iface_id(struct usb_serial_driver *drv, struct usb_interface *intf) { const struct usb_device_id *id; id = usb_match_id(intf, drv->id_table); if (id) { dev_dbg(&intf->dev, "static descriptor matches\n"); goto exit; } id = match_dynamic_id(intf, drv); if (id) dev_dbg(&intf->dev, "dynamic descriptor matches\n"); exit: return id; } /* Caller must hold table_lock */ static struct usb_serial_driver *search_serial_device( struct usb_interface *iface) { const struct usb_device_id *id = NULL; struct usb_serial_driver *drv; struct usb_driver *driver = to_usb_driver(iface->dev.driver); /* Check if the usb id matches a known device */ list_for_each_entry(drv, &usb_serial_driver_list, driver_list) { if (drv->usb_driver == driver) id = get_iface_id(drv, iface); if (id) return drv; } return NULL; } static bool serial_port_carrier_raised(struct tty_port *port) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->carrier_raised) return drv->carrier_raised(p); /* No carrier control - don't block */ return true; } static void serial_port_dtr_rts(struct tty_port *port, bool on) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->dtr_rts) drv->dtr_rts(p, on); } static ssize_t port_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_serial_port *port = to_usb_serial_port(dev); return sprintf(buf, "%u\n", port->port_number); } static DEVICE_ATTR_RO(port_number); static struct attribute *usb_serial_port_attrs[] = { &dev_attr_port_number.attr, NULL }; ATTRIBUTE_GROUPS(usb_serial_port); static const struct tty_port_operations serial_port_ops = { .carrier_raised = serial_port_carrier_raised, .dtr_rts = serial_port_dtr_rts, .activate = serial_port_activate, .shutdown = serial_port_shutdown, }; static void store_endpoint(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_endpoint_descriptor *epd) { struct device *dev = &serial->interface->dev; u8 addr = epd->bEndpointAddress; if (usb_endpoint_is_bulk_in(epd)) { if (epds->num_bulk_in == ARRAY_SIZE(epds->bulk_in)) return; dev_dbg(dev, "found bulk in endpoint %02x\n", addr); epds->bulk_in[epds->num_bulk_in++] = epd; } else if (usb_endpoint_is_bulk_out(epd)) { if (epds->num_bulk_out == ARRAY_SIZE(epds->bulk_out)) return; dev_dbg(dev, "found bulk out endpoint %02x\n", addr); epds->bulk_out[epds->num_bulk_out++] = epd; } else if (usb_endpoint_is_int_in(epd)) { if (epds->num_interrupt_in == ARRAY_SIZE(epds->interrupt_in)) return; dev_dbg(dev, "found interrupt in endpoint %02x\n", addr); epds->interrupt_in[epds->num_interrupt_in++] = epd; } else if (usb_endpoint_is_int_out(epd)) { if (epds->num_interrupt_out == ARRAY_SIZE(epds->interrupt_out)) return; dev_dbg(dev, "found interrupt out endpoint %02x\n", addr); epds->interrupt_out[epds->num_interrupt_out++] = epd; } } static void find_endpoints(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_interface *intf) { struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *epd; unsigned int i; iface_desc = intf->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { epd = &iface_desc->endpoint[i].desc; store_endpoint(serial, epds, epd); } } static int setup_port_bulk_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; buffer_size = max_t(int, type->bulk_in_size, usb_endpoint_maxp(epd)); port->bulk_in_size = buffer_size; port->bulk_in_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { set_bit(i, &port->read_urbs_free); port->read_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->read_urbs[i]) return -ENOMEM; port->bulk_in_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_in_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->read_urbs[i], udev, usb_rcvbulkpipe(udev, epd->bEndpointAddress), port->bulk_in_buffers[i], buffer_size, type->read_bulk_callback, port); } port->read_urb = port->read_urbs[0]; port->bulk_in_buffer = port->bulk_in_buffers[0]; return 0; } static int setup_port_bulk_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) return -ENOMEM; if (type->bulk_out_size) buffer_size = type->bulk_out_size; else buffer_size = usb_endpoint_maxp(epd); port->bulk_out_size = buffer_size; port->bulk_out_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { set_bit(i, &port->write_urbs_free); port->write_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->write_urbs[i]) return -ENOMEM; port->bulk_out_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_out_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->write_urbs[i], udev, usb_sndbulkpipe(udev, epd->bEndpointAddress), port->bulk_out_buffers[i], buffer_size, type->write_bulk_callback, port); } port->write_urb = port->write_urbs[0]; port->bulk_out_buffer = port->bulk_out_buffers[0]; return 0; } static int setup_port_interrupt_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_in_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_in_endpointAddress = epd->bEndpointAddress; port->interrupt_in_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_in_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_in_urb, udev, usb_rcvintpipe(udev, epd->bEndpointAddress), port->interrupt_in_buffer, buffer_size, type->read_int_callback, port, epd->bInterval); return 0; } static int setup_port_interrupt_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_out_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_out_size = buffer_size; port->interrupt_out_endpointAddress = epd->bEndpointAddress; port->interrupt_out_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_out_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_out_urb, udev, usb_sndintpipe(udev, epd->bEndpointAddress), port->interrupt_out_buffer, buffer_size, type->write_int_callback, port, epd->bInterval); return 0; } static int usb_serial_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct device *ddev = &interface->dev; struct usb_device *dev = interface_to_usbdev(interface); struct usb_serial *serial = NULL; struct usb_serial_port *port; struct usb_serial_endpoints *epds; struct usb_serial_driver *type = NULL; int retval; int i; int num_ports = 0; unsigned char max_endpoints; mutex_lock(&table_lock); type = search_serial_device(interface); if (!type) { mutex_unlock(&table_lock); dev_dbg(ddev, "none matched\n"); return -ENODEV; } if (!try_module_get(type->driver.owner)) { mutex_unlock(&table_lock); dev_err(ddev, "module get failed, exiting\n"); return -EIO; } mutex_unlock(&table_lock); serial = create_serial(dev, interface, type); if (!serial) { retval = -ENOMEM; goto err_put_module; } /* if this device type has a probe function, call it */ if (type->probe) { const struct usb_device_id *id; id = get_iface_id(type, interface); retval = type->probe(serial, id); if (retval) { dev_dbg(ddev, "sub driver rejected device\n"); goto err_release_sibling; } } /* descriptor matches, let's find the endpoints needed */ epds = kzalloc(sizeof(*epds), GFP_KERNEL); if (!epds) { retval = -ENOMEM; goto err_release_sibling; } find_endpoints(serial, epds, interface); if (serial->sibling) find_endpoints(serial, epds, serial->sibling); if (epds->num_bulk_in < type->num_bulk_in || epds->num_bulk_out < type->num_bulk_out || epds->num_interrupt_in < type->num_interrupt_in || epds->num_interrupt_out < type->num_interrupt_out) { dev_err(ddev, "required endpoints missing\n"); retval = -ENODEV; goto err_free_epds; } if (type->calc_num_ports) { retval = type->calc_num_ports(serial, epds); if (retval < 0) goto err_free_epds; num_ports = retval; } if (!num_ports) num_ports = type->num_ports; if (num_ports > MAX_NUM_PORTS) { dev_warn(ddev, "too many ports requested: %d\n", num_ports); num_ports = MAX_NUM_PORTS; } serial->num_ports = (unsigned char)num_ports; serial->num_bulk_in = epds->num_bulk_in; serial->num_bulk_out = epds->num_bulk_out; serial->num_interrupt_in = epds->num_interrupt_in; serial->num_interrupt_out = epds->num_interrupt_out; /* found all that we need */ dev_info(ddev, "%s converter detected\n", type->description); /* create our ports, we need as many as the max endpoints */ /* we don't use num_ports here because some devices have more endpoint pairs than ports */ max_endpoints = max(epds->num_bulk_in, epds->num_bulk_out); max_endpoints = max(max_endpoints, epds->num_interrupt_in); max_endpoints = max(max_endpoints, epds->num_interrupt_out); max_endpoints = max(max_endpoints, serial->num_ports); serial->num_port_pointers = max_endpoints; dev_dbg(ddev, "setting up %d port structure(s)\n", max_endpoints); for (i = 0; i < max_endpoints; ++i) { port = kzalloc(sizeof(struct usb_serial_port), GFP_KERNEL); if (!port) { retval = -ENOMEM; goto err_free_epds; } tty_port_init(&port->port); port->port.ops = &serial_port_ops; port->serial = serial; spin_lock_init(&port->lock); /* Keep this for private driver use for the moment but should probably go away */ INIT_WORK(&port->work, usb_serial_port_work); serial->port[i] = port; port->dev.parent = &interface->dev; port->dev.driver = NULL; port->dev.bus = &usb_serial_bus_type; port->dev.release = &usb_serial_port_release; port->dev.groups = usb_serial_port_groups; device_initialize(&port->dev); } /* set up the endpoint information */ for (i = 0; i < epds->num_bulk_in; ++i) { retval = setup_port_bulk_in(serial->port[i], epds->bulk_in[i]); if (retval) goto err_free_epds; } for (i = 0; i < epds->num_bulk_out; ++i) { retval = setup_port_bulk_out(serial->port[i], epds->bulk_out[i]); if (retval) goto err_free_epds; } if (serial->type->read_int_callback) { for (i = 0; i < epds->num_interrupt_in; ++i) { retval = setup_port_interrupt_in(serial->port[i], epds->interrupt_in[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_in) { dev_dbg(ddev, "The device claims to support interrupt in transfers, but read_int_callback is not defined\n"); } if (serial->type->write_int_callback) { for (i = 0; i < epds->num_interrupt_out; ++i) { retval = setup_port_interrupt_out(serial->port[i], epds->interrupt_out[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_out) { dev_dbg(ddev, "The device claims to support interrupt out transfers, but write_int_callback is not defined\n"); } usb_set_intfdata(interface, serial); /* if this device type has an attach function, call it */ if (type->attach) { retval = type->attach(serial); if (retval < 0) goto err_free_epds; serial->attached = 1; if (retval > 0) { /* quietly accept this device, but don't bind to a serial port as it's about to disappear */ serial->num_ports = 0; goto exit; } } else { serial->attached = 1; } retval = allocate_minors(serial, num_ports); if (retval) { dev_err(ddev, "No more free serial minor numbers\n"); goto err_free_epds; } /* register all of the individual ports with the driver core */ for (i = 0; i < num_ports; ++i) { port = serial->port[i]; dev_set_name(&port->dev, "ttyUSB%d", port->minor); dev_dbg(ddev, "registering %s\n", dev_name(&port->dev)); device_enable_async_suspend(&port->dev); retval = device_add(&port->dev); if (retval) dev_err(ddev, "Error registering port device, continuing\n"); } if (num_ports > 0) usb_serial_console_init(serial->port[0]->minor); exit: kfree(epds); module_put(type->driver.owner); return 0; err_free_epds: kfree(epds); err_release_sibling: release_sibling(serial, interface); usb_serial_put(serial); err_put_module: module_put(type->driver.owner); return retval; } static void usb_serial_disconnect(struct usb_interface *interface) { int i; struct usb_serial *serial = usb_get_intfdata(interface); struct device *dev = &interface->dev; struct usb_serial_port *port; struct tty_struct *tty; /* sibling interface is cleaning up */ if (!serial) return; usb_serial_console_disconnect(serial); mutex_lock(&serial->disc_mutex); /* must set a flag, to signal subdrivers */ serial->disconnected = 1; mutex_unlock(&serial->disc_mutex); for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; tty = tty_port_tty_get(&port->port); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } usb_serial_port_poison_urbs(port); wake_up_interruptible(&port->port.delta_msr_wait); cancel_work_sync(&port->work); if (device_is_registered(&port->dev)) device_del(&port->dev); } if (serial->type->disconnect) serial->type->disconnect(serial); release_sibling(serial, interface); /* let the last holder of this object cause it to be cleaned up */ usb_serial_put(serial); dev_info(dev, "device disconnected\n"); } int usb_serial_suspend(struct usb_interface *intf, pm_message_t message) { struct usb_serial *serial = usb_get_intfdata(intf); int i, r; /* suspend when called for first sibling interface */ if (serial->suspend_count++) return 0; /* * serial->type->suspend() MUST return 0 in system sleep context, * otherwise, the resume callback has to recover device from * previous suspend failure. */ if (serial->type->suspend) { r = serial->type->suspend(serial, message); if (r < 0) { serial->suspend_count--; return r; } } for (i = 0; i < serial->num_ports; ++i) usb_serial_port_poison_urbs(serial->port[i]); return 0; } EXPORT_SYMBOL(usb_serial_suspend); static void usb_serial_unpoison_port_urbs(struct usb_serial *serial) { int i; for (i = 0; i < serial->num_ports; ++i) usb_serial_port_unpoison_urbs(serial->port[i]); } int usb_serial_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->resume) rv = serial->type->resume(serial); else rv = usb_serial_generic_resume(serial); return rv; } EXPORT_SYMBOL(usb_serial_resume); static int usb_serial_reset_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->reset_resume) { rv = serial->type->reset_resume(serial); } else { rv = -EOPNOTSUPP; intf->needs_binding = 1; } return rv; } static const struct tty_operations serial_ops = { .open = serial_open, .close = serial_close, .write = serial_write, .hangup = serial_hangup, .write_room = serial_write_room, .ioctl = serial_ioctl, .set_termios = serial_set_termios, .throttle = serial_throttle, .unthrottle = serial_unthrottle, .break_ctl = serial_break, .chars_in_buffer = serial_chars_in_buffer, .wait_until_sent = serial_wait_until_sent, .tiocmget = serial_tiocmget, .tiocmset = serial_tiocmset, .get_icount = serial_get_icount, .set_serial = serial_set_serial, .get_serial = serial_get_serial, .cleanup = serial_cleanup, .install = serial_install, .proc_show = serial_proc_show, }; struct tty_driver *usb_serial_tty_driver; static int __init usb_serial_init(void) { int result; usb_serial_tty_driver = tty_alloc_driver(USB_SERIAL_TTY_MINORS, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(usb_serial_tty_driver)) return PTR_ERR(usb_serial_tty_driver); /* Initialize our global data */ result = bus_register(&usb_serial_bus_type); if (result) { pr_err("%s - registering bus driver failed\n", __func__); goto err_put_driver; } usb_serial_tty_driver->driver_name = "usbserial"; usb_serial_tty_driver->name = "ttyUSB"; usb_serial_tty_driver->major = USB_SERIAL_TTY_MAJOR; usb_serial_tty_driver->minor_start = 0; usb_serial_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; usb_serial_tty_driver->subtype = SERIAL_TYPE_NORMAL; usb_serial_tty_driver->init_termios = tty_std_termios; usb_serial_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; usb_serial_tty_driver->init_termios.c_ispeed = 9600; usb_serial_tty_driver->init_termios.c_ospeed = 9600; tty_set_operations(usb_serial_tty_driver, &serial_ops); result = tty_register_driver(usb_serial_tty_driver); if (result) { pr_err("%s - tty_register_driver failed\n", __func__); goto err_unregister_bus; } /* register the generic driver, if we should */ result = usb_serial_generic_register(); if (result < 0) { pr_err("%s - registering generic driver failed\n", __func__); goto err_unregister_driver; } return result; err_unregister_driver: tty_unregister_driver(usb_serial_tty_driver); err_unregister_bus: bus_unregister(&usb_serial_bus_type); err_put_driver: pr_err("%s - returning with error %d\n", __func__, result); tty_driver_kref_put(usb_serial_tty_driver); return result; } static void __exit usb_serial_exit(void) { usb_serial_console_exit(); usb_serial_generic_deregister(); tty_unregister_driver(usb_serial_tty_driver); tty_driver_kref_put(usb_serial_tty_driver); bus_unregister(&usb_serial_bus_type); idr_destroy(&serial_minors); } module_init(usb_serial_init); module_exit(usb_serial_exit); #define set_to_generic_if_null(type, function) \ do { \ if (!type->function) { \ type->function = usb_serial_generic_##function; \ pr_debug("%s: using generic " #function "\n", \ type->driver.name); \ } \ } while (0) static void usb_serial_operations_init(struct usb_serial_driver *device) { set_to_generic_if_null(device, open); set_to_generic_if_null(device, write); set_to_generic_if_null(device, close); set_to_generic_if_null(device, write_room); set_to_generic_if_null(device, chars_in_buffer); if (device->tx_empty) set_to_generic_if_null(device, wait_until_sent); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); set_to_generic_if_null(device, process_read_urb); set_to_generic_if_null(device, prepare_write_buffer); } static int usb_serial_register(struct usb_serial_driver *driver) { int retval; if (usb_disabled()) return -ENODEV; if (!driver->description) driver->description = driver->driver.name; if (!driver->usb_driver) { WARN(1, "Serial driver %s has no usb_driver\n", driver->description); return -EINVAL; } /* Prevent individual ports from being unbound. */ driver->driver.suppress_bind_attrs = true; usb_serial_operations_init(driver); /* Add this device to our list of devices */ mutex_lock(&table_lock); list_add(&driver->driver_list, &usb_serial_driver_list); retval = usb_serial_bus_register(driver); if (retval) { pr_err("problem %d when registering driver %s\n", retval, driver->description); list_del(&driver->driver_list); } else { pr_info("USB Serial support registered for %s\n", driver->description); } mutex_unlock(&table_lock); return retval; } static void usb_serial_deregister(struct usb_serial_driver *device) { pr_info("USB Serial deregistering driver %s\n", device->description); mutex_lock(&table_lock); list_del(&device->driver_list); mutex_unlock(&table_lock); usb_serial_bus_deregister(device); } /** * __usb_serial_register_drivers - register drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be registered * @owner: owning module * @name: name of the usb_driver for this set of @serial_drivers * @id_table: list of all devices this @serial_drivers set binds to * * Registers all the drivers in the @serial_drivers array, and dynamically * creates a struct usb_driver with the name @name and id_table of @id_table. */ int __usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], struct module *owner, const char *name, const struct usb_device_id *id_table) { int rc; struct usb_driver *udriver; struct usb_serial_driver * const *sd; /* * udriver must be registered before any of the serial drivers, * because the store_new_id() routine for the serial drivers (in * bus.c) probes udriver. * * Performance hack: We don't want udriver to be probed until * the serial drivers are registered, because the probe would * simply fail for lack of a matching serial driver. * So we leave udriver's id_table set to NULL until we are all set. * * Suspend/resume support is implemented in the usb-serial core, * so fill in the PM-related fields in udriver. */ udriver = kzalloc(sizeof(*udriver), GFP_KERNEL); if (!udriver) return -ENOMEM; udriver->name = name; udriver->no_dynamic_id = 1; udriver->supports_autosuspend = 1; udriver->suspend = usb_serial_suspend; udriver->resume = usb_serial_resume; udriver->probe = usb_serial_probe; udriver->disconnect = usb_serial_disconnect; /* we only set the reset_resume field if the serial_driver has one */ for (sd = serial_drivers; *sd; ++sd) { if ((*sd)->reset_resume) { udriver->reset_resume = usb_serial_reset_resume; break; } } rc = usb_register(udriver); if (rc) goto err_free_driver; for (sd = serial_drivers; *sd; ++sd) { (*sd)->usb_driver = udriver; (*sd)->driver.owner = owner; rc = usb_serial_register(*sd); if (rc) goto err_deregister_drivers; } /* Now set udriver's id_table and look for matches */ udriver->id_table = id_table; rc = driver_attach(&udriver->driver); return 0; err_deregister_drivers: while (sd-- > serial_drivers) usb_serial_deregister(*sd); usb_deregister(udriver); err_free_driver: kfree(udriver); return rc; } EXPORT_SYMBOL_GPL(__usb_serial_register_drivers); /** * usb_serial_deregister_drivers - deregister drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be deregistered * * Deregisters all the drivers in the @serial_drivers array and deregisters and * frees the struct usb_driver that was created by the call to * usb_serial_register_drivers(). */ void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]) { struct usb_driver *udriver = (*serial_drivers)->usb_driver; for (; *serial_drivers; ++serial_drivers) usb_serial_deregister(*serial_drivers); usb_deregister(udriver); kfree(udriver); } EXPORT_SYMBOL_GPL(usb_serial_deregister_drivers); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2"); |
| 16993 7325 16787 12757 7323 7323 12753 7325 16690 16704 16662 643 643 642 421 294 643 25 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/irq_work.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/sched.h> #include <linux/tick.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/smp.h> #include <linux/smpboot.h> #include <asm/processor.h> #include <linux/kasan.h> #include <trace/events/ipi.h> static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); static DEFINE_PER_CPU(struct task_struct *, irq_workd); static void wake_irq_workd(void) { struct task_struct *tsk = __this_cpu_read(irq_workd); if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) wake_up_process(tsk); } #ifdef CONFIG_SMP static void irq_work_wake(struct irq_work *entry) { wake_irq_workd(); } static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = IRQ_WORK_INIT_HARD(irq_work_wake); #endif static int irq_workd_should_run(unsigned int cpu) { return !llist_empty(this_cpu_ptr(&lazy_list)); } /* * Claim the entry so that no one else will poke at it. */ static bool irq_work_claim(struct irq_work *work) { int oflags; oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) return false; return true; } void __weak arch_irq_work_raise(void) { /* * Lame architectures will get the timer tick callback */ } static __always_inline void irq_work_raise(struct irq_work *work) { if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); arch_irq_work_raise(); } /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { struct llist_head *list; bool rt_lazy_work = false; bool lazy_work = false; int work_flags; work_flags = atomic_read(&work->node.a_flags); if (work_flags & IRQ_WORK_LAZY) lazy_work = true; else if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(work_flags & IRQ_WORK_HARD_IRQ)) rt_lazy_work = true; if (lazy_work || rt_lazy_work) list = this_cpu_ptr(&lazy_list); else list = this_cpu_ptr(&raised_list); if (!llist_add(&work->node.llist, list)) return; /* If the work is "lazy", handle it from next tick if any */ if (!lazy_work || tick_nohz_tick_stopped()) irq_work_raise(work); } /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); preempt_enable(); return true; } EXPORT_SYMBOL_GPL(irq_work_queue); /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ bool irq_work_queue_on(struct irq_work *work, int cpu) { #ifndef CONFIG_SMP return irq_work_queue(work); #else /* CONFIG_SMP: */ /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); /* * On PREEMPT_RT the items which are not marked as * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work * item is used on the remote CPU to wake the thread. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) goto out; work = &per_cpu(irq_work_wakeup, cpu); if (!irq_work_claim(work)) goto out; } __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); if (llist_empty(raised) || arch_irq_work_has_interrupt()) if (llist_empty(lazy)) return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); return true; } void irq_work_single(void *arg) { struct irq_work *work = arg; int flags; /* * Clear the PENDING bit, after this point the @work can be re-used. * The PENDING bit acts as a lock, and we own it, so we can clear it * without atomic ops. */ flags = atomic_read(&work->node.a_flags); flags &= ~IRQ_WORK_PENDING; atomic_set(&work->node.a_flags, flags); /* * See irq_work_claim(). */ smp_mb(); lockdep_irq_work_enter(flags); work->func(work); lockdep_irq_work_exit(flags); /* * Clear the BUSY bit, if set, and return to the free state if no-one * else claimed it meanwhile. */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; /* * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed * in a per-CPU thread in preemptible context. Only the items which are * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. */ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } /* * hotplug calls this through: * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. */ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); might_sleep(); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { irq_work_run_list(this_cpu_ptr(&lazy_list)); } static void irq_workd_setup(unsigned int cpu) { sched_set_fifo_low(current); } static struct smp_hotplug_thread irqwork_threads = { .store = &irq_workd, .setup = irq_workd_setup, .thread_should_run = irq_workd_should_run, .thread_fn = run_irq_workd, .thread_comm = "irq_work/%u", }; static __init int irq_work_init_threads(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); return 0; } early_initcall(irq_work_init_threads); |
| 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Samsung Electronics Co., Ltd. * Sylwester Nawrocki <s.nawrocki@samsung.com> */ #include <linux/clk.h> #include <linux/clk-provider.h> #include <linux/clk/clk-conf.h> #include <linux/device.h> #include <linux/of.h> #include <linux/printk.h> #include <linux/slab.h> static int __set_clk_parents(struct device_node *node, bool clk_supplier) { struct of_phandle_args clkspec; int index, rc, num_parents; struct clk *clk, *pclk; num_parents = of_count_phandle_with_args(node, "assigned-clock-parents", "#clock-cells"); if (num_parents == -EINVAL) pr_err("clk: invalid value of clock-parents property at %pOF\n", node); for (index = 0; index < num_parents; index++) { rc = of_parse_phandle_with_args(node, "assigned-clock-parents", "#clock-cells", index, &clkspec); if (rc < 0) { /* skip empty (null) phandles */ if (rc == -ENOENT) continue; else return rc; } if (clkspec.np == node && !clk_supplier) { of_node_put(clkspec.np); return 0; } pclk = of_clk_get_from_provider(&clkspec); of_node_put(clkspec.np); if (IS_ERR(pclk)) { if (PTR_ERR(pclk) != -EPROBE_DEFER) pr_warn("clk: couldn't get parent clock %d for %pOF\n", index, node); return PTR_ERR(pclk); } rc = of_parse_phandle_with_args(node, "assigned-clocks", "#clock-cells", index, &clkspec); if (rc < 0) goto err; if (clkspec.np == node && !clk_supplier) { of_node_put(clkspec.np); rc = 0; goto err; } clk = of_clk_get_from_provider(&clkspec); of_node_put(clkspec.np); if (IS_ERR(clk)) { if (PTR_ERR(clk) != -EPROBE_DEFER) pr_warn("clk: couldn't get assigned clock %d for %pOF\n", index, node); rc = PTR_ERR(clk); goto err; } rc = clk_set_parent(clk, pclk); if (rc < 0) pr_err("clk: failed to reparent %s to %s: %d\n", __clk_get_name(clk), __clk_get_name(pclk), rc); clk_put(clk); clk_put(pclk); } return 0; err: clk_put(pclk); return rc; } static int __set_clk_rates(struct device_node *node, bool clk_supplier) { struct of_phandle_args clkspec; int rc, count, count_64, index; struct clk *clk; u64 *rates_64 __free(kfree) = NULL; u32 *rates __free(kfree) = NULL; count = of_property_count_u32_elems(node, "assigned-clock-rates"); count_64 = of_property_count_u64_elems(node, "assigned-clock-rates-u64"); if (count_64 > 0) { count = count_64; rates_64 = kcalloc(count, sizeof(*rates_64), GFP_KERNEL); if (!rates_64) return -ENOMEM; rc = of_property_read_u64_array(node, "assigned-clock-rates-u64", rates_64, count); } else if (count > 0) { rates = kcalloc(count, sizeof(*rates), GFP_KERNEL); if (!rates) return -ENOMEM; rc = of_property_read_u32_array(node, "assigned-clock-rates", rates, count); } else { return 0; } if (rc) return rc; for (index = 0; index < count; index++) { unsigned long rate; if (rates_64) rate = rates_64[index]; else rate = rates[index]; if (rate) { rc = of_parse_phandle_with_args(node, "assigned-clocks", "#clock-cells", index, &clkspec); if (rc < 0) { /* skip empty (null) phandles */ if (rc == -ENOENT) continue; else return rc; } if (clkspec.np == node && !clk_supplier) { of_node_put(clkspec.np); return 0; } clk = of_clk_get_from_provider(&clkspec); of_node_put(clkspec.np); if (IS_ERR(clk)) { if (PTR_ERR(clk) != -EPROBE_DEFER) pr_warn("clk: couldn't get clock %d for %pOF\n", index, node); return PTR_ERR(clk); } rc = clk_set_rate(clk, rate); if (rc < 0) pr_err("clk: couldn't set %s clk rate to %lu (%d), current rate: %lu\n", __clk_get_name(clk), rate, rc, clk_get_rate(clk)); clk_put(clk); } } return 0; } /** * of_clk_set_defaults() - parse and set assigned clocks configuration * @node: device node to apply clock settings for * @clk_supplier: true if clocks supplied by @node should also be considered * * This function parses 'assigned-{clocks/clock-parents/clock-rates}' properties * and sets any specified clock parents and rates. The @clk_supplier argument * should be set to true if @node may be also a clock supplier of any clock * listed in its 'assigned-clocks' or 'assigned-clock-parents' properties. * If @clk_supplier is false the function exits returning 0 as soon as it * determines the @node is also a supplier of any of the clocks. */ int of_clk_set_defaults(struct device_node *node, bool clk_supplier) { int rc; if (!node) return 0; rc = __set_clk_parents(node, clk_supplier); if (rc < 0) return rc; return __set_clk_rates(node, clk_supplier); } EXPORT_SYMBOL_GPL(of_clk_set_defaults); |
| 6860 6860 6857 6799 764 767 6855 6873 685 685 6694 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @bio: the target &bio * * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_bio_issue_init(struct bio *bio) { bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ |
| 3 6 9 10 1 1 1 2 5 5 1 9 9 9 9 9 9 9 9 6 3 9 9 9 9 9 1 7 1 7 2 8 1 9 1 1 10 10 5 5 2 1 1 9 9 7 2 9 9 9 9 9 9 3 6 6 9 2 1 1 1 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved. */ #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/pkt_sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_bonding.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <net/arp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <asm/byteorder.h> #include <net/bonding.h> #include <net/bond_alb.h> static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x01 }; static const int alb_delta_in_ticks = HZ / ALB_TIMER_TICKS_PER_SEC; #pragma pack(1) struct learning_pkt { u8 mac_dst[ETH_ALEN]; u8 mac_src[ETH_ALEN]; __be16 type; u8 padding[ETH_ZLEN - ETH_HLEN]; }; struct arp_pkt { __be16 hw_addr_space; __be16 prot_addr_space; u8 hw_addr_len; u8 prot_addr_len; __be16 op_code; u8 mac_src[ETH_ALEN]; /* sender hardware address */ __be32 ip_src; /* sender IP address */ u8 mac_dst[ETH_ALEN]; /* target hardware address */ __be32 ip_dst; /* target IP address */ }; #pragma pack() /* Forward declaration */ static void alb_send_learning_packets(struct slave *slave, const u8 mac_addr[], bool strict_match); static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp); static void rlb_src_unlink(struct bonding *bond, u32 index); static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash); static inline u8 _simple_hash(const u8 *hash_start, int hash_size) { int i; u8 hash = 0; for (i = 0; i < hash_size; i++) hash ^= hash_start[i]; return hash; } /*********************** tlb specific functions ***************************/ static inline void tlb_init_table_entry(struct tlb_client_info *entry, int save_load) { if (save_load) { entry->load_history = 1 + entry->tx_bytes / BOND_TLB_REBALANCE_INTERVAL; entry->tx_bytes = 0; } entry->tx_slave = NULL; entry->next = TLB_NULL_INDEX; entry->prev = TLB_NULL_INDEX; } static inline void tlb_init_slave(struct slave *slave) { SLAVE_TLB_INFO(slave).load = 0; SLAVE_TLB_INFO(slave).head = TLB_NULL_INDEX; } static void __tlb_clear_slave(struct bonding *bond, struct slave *slave, int save_load) { struct tlb_client_info *tx_hash_table; u32 index; /* clear slave from tx_hashtbl */ tx_hash_table = BOND_ALB_INFO(bond).tx_hashtbl; /* skip this if we've already freed the tx hash table */ if (tx_hash_table) { index = SLAVE_TLB_INFO(slave).head; while (index != TLB_NULL_INDEX) { u32 next_index = tx_hash_table[index].next; tlb_init_table_entry(&tx_hash_table[index], save_load); index = next_index; } } tlb_init_slave(slave); } static void tlb_clear_slave(struct bonding *bond, struct slave *slave, int save_load) { spin_lock_bh(&bond->mode_lock); __tlb_clear_slave(bond, slave, save_load); spin_unlock_bh(&bond->mode_lock); } /* Must be called before starting the monitor timer */ static int tlb_initialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); int size = TLB_HASH_TABLE_SIZE * sizeof(struct tlb_client_info); struct tlb_client_info *new_hashtbl; int i; new_hashtbl = kzalloc(size, GFP_KERNEL); if (!new_hashtbl) return -ENOMEM; spin_lock_bh(&bond->mode_lock); bond_info->tx_hashtbl = new_hashtbl; for (i = 0; i < TLB_HASH_TABLE_SIZE; i++) tlb_init_table_entry(&bond_info->tx_hashtbl[i], 0); spin_unlock_bh(&bond->mode_lock); return 0; } /* Must be called only after all slaves have been released */ static void tlb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); spin_lock_bh(&bond->mode_lock); kfree(bond_info->tx_hashtbl); bond_info->tx_hashtbl = NULL; spin_unlock_bh(&bond->mode_lock); } static long long compute_gap(struct slave *slave) { return (s64) (slave->speed << 20) - /* Convert to Megabit per sec */ (s64) (SLAVE_TLB_INFO(slave).load << 3); /* Bytes to bits */ } static struct slave *tlb_get_least_loaded_slave(struct bonding *bond) { struct slave *slave, *least_loaded; struct list_head *iter; long long max_gap; least_loaded = NULL; max_gap = LLONG_MIN; /* Find the slave with the largest gap */ bond_for_each_slave_rcu(bond, slave, iter) { if (bond_slave_can_tx(slave)) { long long gap = compute_gap(slave); if (max_gap < gap) { least_loaded = slave; max_gap = gap; } } } return least_loaded; } static struct slave *__tlb_choose_channel(struct bonding *bond, u32 hash_index, u32 skb_len) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct tlb_client_info *hash_table; struct slave *assigned_slave; hash_table = bond_info->tx_hashtbl; assigned_slave = hash_table[hash_index].tx_slave; if (!assigned_slave) { assigned_slave = tlb_get_least_loaded_slave(bond); if (assigned_slave) { struct tlb_slave_info *slave_info = &(SLAVE_TLB_INFO(assigned_slave)); u32 next_index = slave_info->head; hash_table[hash_index].tx_slave = assigned_slave; hash_table[hash_index].next = next_index; hash_table[hash_index].prev = TLB_NULL_INDEX; if (next_index != TLB_NULL_INDEX) hash_table[next_index].prev = hash_index; slave_info->head = hash_index; slave_info->load += hash_table[hash_index].load_history; } } if (assigned_slave) hash_table[hash_index].tx_bytes += skb_len; return assigned_slave; } static struct slave *tlb_choose_channel(struct bonding *bond, u32 hash_index, u32 skb_len) { struct slave *tx_slave; /* We don't need to disable softirq here, because * tlb_choose_channel() is only called by bond_alb_xmit() * which already has softirq disabled. */ spin_lock(&bond->mode_lock); tx_slave = __tlb_choose_channel(bond, hash_index, skb_len); spin_unlock(&bond->mode_lock); return tx_slave; } /*********************** rlb specific functions ***************************/ /* when an ARP REPLY is received from a client update its info * in the rx_hashtbl */ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; spin_lock_bh(&bond->mode_lock); hash_index = _simple_hash((u8 *)&(arp->ip_src), sizeof(arp->ip_src)); client_info = &(bond_info->rx_hashtbl[hash_index]); if ((client_info->assigned) && (client_info->ip_src == arp->ip_dst) && (client_info->ip_dst == arp->ip_src) && (!ether_addr_equal_64bits(client_info->mac_dst, arp->mac_src))) { /* update the clients MAC address */ ether_addr_copy(client_info->mac_dst, arp->mac_src); client_info->ntt = 1; bond_info->rx_ntt = 1; } spin_unlock_bh(&bond->mode_lock); } static int rlb_arp_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct arp_pkt *arp, _arp; if (skb->protocol != cpu_to_be16(ETH_P_ARP)) goto out; arp = skb_header_pointer(skb, 0, sizeof(_arp), &_arp); if (!arp) goto out; /* We received an ARP from arp->ip_src. * We might have used this IP address previously (on the bonding host * itself or on a system that is bridged together with the bond). * However, if arp->mac_src is different than what is stored in * rx_hashtbl, some other host is now using the IP and we must prevent * sending out client updates with this IP address and the old MAC * address. * Clean up all hash table entries that have this address as ip_src but * have a different mac_src. */ rlb_purge_src_ip(bond, arp); if (arp->op_code == htons(ARPOP_REPLY)) { /* update rx hash table for this ARP */ rlb_update_entry_from_arp(bond, arp); slave_dbg(bond->dev, slave->dev, "Server received an ARP Reply from client\n"); } out: return RX_HANDLER_ANOTHER; } /* Caller must hold rcu_read_lock() */ static struct slave *__rlb_next_rx_slave(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct slave *before = NULL, *rx_slave = NULL, *slave; struct list_head *iter; bool found = false; bond_for_each_slave_rcu(bond, slave, iter) { if (!bond_slave_can_tx(slave)) continue; if (!found) { if (!before || before->speed < slave->speed) before = slave; } else { if (!rx_slave || rx_slave->speed < slave->speed) rx_slave = slave; } if (slave == bond_info->rx_slave) found = true; } /* we didn't find anything after the current or we have something * better before and up to the current slave */ if (!rx_slave || (before && rx_slave->speed < before->speed)) rx_slave = before; if (rx_slave) bond_info->rx_slave = rx_slave; return rx_slave; } /* Caller must hold RTNL, rcu_read_lock is obtained only to silence checkers */ static struct slave *rlb_next_rx_slave(struct bonding *bond) { struct slave *rx_slave; ASSERT_RTNL(); rcu_read_lock(); rx_slave = __rlb_next_rx_slave(bond); rcu_read_unlock(); return rx_slave; } /* teach the switch the mac of a disabled slave * on the primary for fault tolerance * * Caller must hold RTNL */ static void rlb_teach_disabled_mac_on_primary(struct bonding *bond, const u8 addr[]) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (!curr_active) return; if (!bond->alb_info.primary_is_promisc) { if (!dev_set_promiscuity(curr_active->dev, 1)) bond->alb_info.primary_is_promisc = 1; else bond->alb_info.primary_is_promisc = 0; } bond->alb_info.rlb_promisc_timeout_counter = 0; alb_send_learning_packets(curr_active, addr, true); } /* slave being removed should not be active at this point * * Caller must hold rtnl. */ static void rlb_clear_slave(struct bonding *bond, struct slave *slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *rx_hash_table; u32 index, next_index; /* clear slave from rx_hashtbl */ spin_lock_bh(&bond->mode_lock); rx_hash_table = bond_info->rx_hashtbl; index = bond_info->rx_hashtbl_used_head; for (; index != RLB_NULL_INDEX; index = next_index) { next_index = rx_hash_table[index].used_next; if (rx_hash_table[index].slave == slave) { struct slave *assigned_slave = rlb_next_rx_slave(bond); if (assigned_slave) { rx_hash_table[index].slave = assigned_slave; if (is_valid_ether_addr(rx_hash_table[index].mac_dst)) { bond_info->rx_hashtbl[index].ntt = 1; bond_info->rx_ntt = 1; /* A slave has been removed from the * table because it is either disabled * or being released. We must retry the * update to avoid clients from not * being updated & disconnecting when * there is stress */ bond_info->rlb_update_retry_counter = RLB_UPDATE_RETRY; } } else { /* there is no active slave */ rx_hash_table[index].slave = NULL; } } } spin_unlock_bh(&bond->mode_lock); if (slave != rtnl_dereference(bond->curr_active_slave)) rlb_teach_disabled_mac_on_primary(bond, slave->dev->dev_addr); } static void rlb_update_client(struct rlb_client_info *client_info) { int i; if (!client_info->slave || !is_valid_ether_addr(client_info->mac_dst)) return; for (i = 0; i < RLB_ARP_BURST_SIZE; i++) { struct sk_buff *skb; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, client_info->ip_dst, client_info->slave->dev, client_info->ip_src, client_info->mac_dst, client_info->slave->dev->dev_addr, client_info->mac_dst); if (!skb) { slave_err(client_info->slave->bond->dev, client_info->slave->dev, "failed to create an ARP packet\n"); continue; } skb->dev = client_info->slave->dev; if (client_info->vlan_id) { __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), client_info->vlan_id); } arp_xmit(skb); } } /* sends ARP REPLIES that update the clients that need updating */ static void rlb_update_rx_clients(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; spin_lock_bh(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); if (client_info->ntt) { rlb_update_client(client_info); if (bond_info->rlb_update_retry_counter == 0) client_info->ntt = 0; } } /* do not update the entries again until this counter is zero so that * not to confuse the clients. */ bond_info->rlb_update_delay_counter = RLB_UPDATE_DELAY; spin_unlock_bh(&bond->mode_lock); } /* The slave was assigned a new mac address - update the clients */ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; int ntt = 0; u32 hash_index; spin_lock_bh(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); if ((client_info->slave == slave) && is_valid_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; ntt = 1; } } /* update the team's flag only after the whole iteration */ if (ntt) { bond_info->rx_ntt = 1; /* fasten the change */ bond_info->rlb_update_retry_counter = RLB_UPDATE_RETRY; } spin_unlock_bh(&bond->mode_lock); } /* mark all clients using src_ip to be updated */ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; spin_lock(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); if (!client_info->slave) { netdev_err(bond->dev, "found a client with no channel in the client's hash table\n"); continue; } /* update all clients using this src_ip, that are not assigned * to the team's address (curr_active_slave) and have a known * unicast mac address. */ if ((client_info->ip_src == src_ip) && !ether_addr_equal_64bits(client_info->slave->dev->dev_addr, bond->dev->dev_addr) && is_valid_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; bond_info->rx_ntt = 1; } } spin_unlock(&bond->mode_lock); } static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bond, const struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct slave *assigned_slave, *curr_active_slave; struct rlb_client_info *client_info; u32 hash_index = 0; spin_lock(&bond->mode_lock); curr_active_slave = rcu_dereference(bond->curr_active_slave); hash_index = _simple_hash((u8 *)&arp->ip_dst, sizeof(arp->ip_dst)); client_info = &(bond_info->rx_hashtbl[hash_index]); if (client_info->assigned) { if ((client_info->ip_src == arp->ip_src) && (client_info->ip_dst == arp->ip_dst)) { /* the entry is already assigned to this client */ if (!is_broadcast_ether_addr(arp->mac_dst)) { /* update mac address from arp */ ether_addr_copy(client_info->mac_dst, arp->mac_dst); } ether_addr_copy(client_info->mac_src, arp->mac_src); assigned_slave = client_info->slave; if (assigned_slave) { spin_unlock(&bond->mode_lock); return assigned_slave; } } else { /* the entry is already assigned to some other client, * move the old client to primary (curr_active_slave) so * that the new client can be assigned to this entry. */ if (curr_active_slave && client_info->slave != curr_active_slave) { client_info->slave = curr_active_slave; rlb_update_client(client_info); } } } /* assign a new slave */ assigned_slave = __rlb_next_rx_slave(bond); if (assigned_slave) { if (!(client_info->assigned && client_info->ip_src == arp->ip_src)) { /* ip_src is going to be updated, * fix the src hash list */ u32 hash_src = _simple_hash((u8 *)&arp->ip_src, sizeof(arp->ip_src)); rlb_src_unlink(bond, hash_index); rlb_src_link(bond, hash_src, hash_index); } client_info->ip_src = arp->ip_src; client_info->ip_dst = arp->ip_dst; /* arp->mac_dst is broadcast for arp requests. * will be updated with clients actual unicast mac address * upon receiving an arp reply. */ ether_addr_copy(client_info->mac_dst, arp->mac_dst); ether_addr_copy(client_info->mac_src, arp->mac_src); client_info->slave = assigned_slave; if (is_valid_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; bond->alb_info.rx_ntt = 1; } else { client_info->ntt = 0; } if (vlan_get_tag(skb, &client_info->vlan_id)) client_info->vlan_id = 0; if (!client_info->assigned) { u32 prev_tbl_head = bond_info->rx_hashtbl_used_head; bond_info->rx_hashtbl_used_head = hash_index; client_info->used_next = prev_tbl_head; if (prev_tbl_head != RLB_NULL_INDEX) { bond_info->rx_hashtbl[prev_tbl_head].used_prev = hash_index; } client_info->assigned = 1; } } spin_unlock(&bond->mode_lock); return assigned_slave; } /* chooses (and returns) transmit channel for arp reply * does not choose channel for other arp types since they are * sent on the curr_active_slave */ static struct slave *rlb_arp_xmit(struct sk_buff *skb, struct bonding *bond) { struct slave *tx_slave = NULL; struct net_device *dev; struct arp_pkt *arp; if (!pskb_network_may_pull(skb, sizeof(*arp))) return NULL; arp = (struct arp_pkt *)skb_network_header(skb); /* Don't modify or load balance ARPs that do not originate * from the bond itself or a VLAN directly above the bond. */ if (!bond_slave_has_mac_rcu(bond, arp->mac_src)) return NULL; dev = ip_dev_find(dev_net(bond->dev), arp->ip_src); if (dev) { if (netif_is_any_bridge_master(dev)) { dev_put(dev); return NULL; } dev_put(dev); } if (arp->op_code == htons(ARPOP_REPLY)) { /* the arp must be sent on the selected rx channel */ tx_slave = rlb_choose_channel(skb, bond, arp); if (tx_slave) bond_hw_addr_copy(arp->mac_src, tx_slave->dev->dev_addr, tx_slave->dev->addr_len); netdev_dbg(bond->dev, "(slave %s): Server sent ARP Reply packet\n", tx_slave ? tx_slave->dev->name : "NULL"); } else if (arp->op_code == htons(ARPOP_REQUEST)) { /* Create an entry in the rx_hashtbl for this client as a * place holder. * When the arp reply is received the entry will be updated * with the correct unicast address of the client. */ tx_slave = rlb_choose_channel(skb, bond, arp); /* The ARP reply packets must be delayed so that * they can cancel out the influence of the ARP request. */ bond->alb_info.rlb_update_delay_counter = RLB_UPDATE_DELAY; /* arp requests are broadcast and are sent on the primary * the arp request will collapse all clients on the subnet to * the primary slave. We must register these clients to be * updated with their assigned mac. */ rlb_req_update_subnet_clients(bond, arp->ip_src); netdev_dbg(bond->dev, "(slave %s): Server sent ARP Request packet\n", tx_slave ? tx_slave->dev->name : "NULL"); } return tx_slave; } static void rlb_rebalance(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct slave *assigned_slave; struct rlb_client_info *client_info; int ntt; u32 hash_index; spin_lock_bh(&bond->mode_lock); ntt = 0; hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); assigned_slave = __rlb_next_rx_slave(bond); if (assigned_slave && (client_info->slave != assigned_slave)) { client_info->slave = assigned_slave; if (!is_zero_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; ntt = 1; } } } /* update the team's flag only after the whole iteration */ if (ntt) bond_info->rx_ntt = 1; spin_unlock_bh(&bond->mode_lock); } /* Caller must hold mode_lock */ static void rlb_init_table_entry_dst(struct rlb_client_info *entry) { entry->used_next = RLB_NULL_INDEX; entry->used_prev = RLB_NULL_INDEX; entry->assigned = 0; entry->slave = NULL; entry->vlan_id = 0; } static void rlb_init_table_entry_src(struct rlb_client_info *entry) { entry->src_first = RLB_NULL_INDEX; entry->src_prev = RLB_NULL_INDEX; entry->src_next = RLB_NULL_INDEX; } static void rlb_init_table_entry(struct rlb_client_info *entry) { memset(entry, 0, sizeof(struct rlb_client_info)); rlb_init_table_entry_dst(entry); rlb_init_table_entry_src(entry); } static void rlb_delete_table_entry_dst(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next_index = bond_info->rx_hashtbl[index].used_next; u32 prev_index = bond_info->rx_hashtbl[index].used_prev; if (index == bond_info->rx_hashtbl_used_head) bond_info->rx_hashtbl_used_head = next_index; if (prev_index != RLB_NULL_INDEX) bond_info->rx_hashtbl[prev_index].used_next = next_index; if (next_index != RLB_NULL_INDEX) bond_info->rx_hashtbl[next_index].used_prev = prev_index; } /* unlink a rlb hash table entry from the src list */ static void rlb_src_unlink(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next_index = bond_info->rx_hashtbl[index].src_next; u32 prev_index = bond_info->rx_hashtbl[index].src_prev; bond_info->rx_hashtbl[index].src_next = RLB_NULL_INDEX; bond_info->rx_hashtbl[index].src_prev = RLB_NULL_INDEX; if (next_index != RLB_NULL_INDEX) bond_info->rx_hashtbl[next_index].src_prev = prev_index; if (prev_index == RLB_NULL_INDEX) return; /* is prev_index pointing to the head of this list? */ if (bond_info->rx_hashtbl[prev_index].src_first == index) bond_info->rx_hashtbl[prev_index].src_first = next_index; else bond_info->rx_hashtbl[prev_index].src_next = next_index; } static void rlb_delete_table_entry(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); rlb_delete_table_entry_dst(bond, index); rlb_init_table_entry_dst(entry); rlb_src_unlink(bond, index); } /* add the rx_hashtbl[ip_dst_hash] entry to the list * of entries with identical ip_src_hash */ static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next; bond_info->rx_hashtbl[ip_dst_hash].src_prev = ip_src_hash; next = bond_info->rx_hashtbl[ip_src_hash].src_first; bond_info->rx_hashtbl[ip_dst_hash].src_next = next; if (next != RLB_NULL_INDEX) bond_info->rx_hashtbl[next].src_prev = ip_dst_hash; bond_info->rx_hashtbl[ip_src_hash].src_first = ip_dst_hash; } /* deletes all rx_hashtbl entries with arp->ip_src if their mac_src does * not match arp->mac_src */ static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 ip_src_hash = _simple_hash((u8 *)&(arp->ip_src), sizeof(arp->ip_src)); u32 index; spin_lock_bh(&bond->mode_lock); index = bond_info->rx_hashtbl[ip_src_hash].src_first; while (index != RLB_NULL_INDEX) { struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); u32 next_index = entry->src_next; if (entry->ip_src == arp->ip_src && !ether_addr_equal_64bits(arp->mac_src, entry->mac_src)) rlb_delete_table_entry(bond, index); index = next_index; } spin_unlock_bh(&bond->mode_lock); } static int rlb_initialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *new_hashtbl; int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info); int i; new_hashtbl = kmalloc(size, GFP_KERNEL); if (!new_hashtbl) return -1; spin_lock_bh(&bond->mode_lock); bond_info->rx_hashtbl = new_hashtbl; bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) rlb_init_table_entry(bond_info->rx_hashtbl + i); spin_unlock_bh(&bond->mode_lock); /* register to receive ARPs */ bond->recv_probe = rlb_arp_recv; return 0; } static void rlb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); spin_lock_bh(&bond->mode_lock); kfree(bond_info->rx_hashtbl); bond_info->rx_hashtbl = NULL; bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; spin_unlock_bh(&bond->mode_lock); } static void rlb_clear_vlan(struct bonding *bond, unsigned short vlan_id) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 curr_index; spin_lock_bh(&bond->mode_lock); curr_index = bond_info->rx_hashtbl_used_head; while (curr_index != RLB_NULL_INDEX) { struct rlb_client_info *curr = &(bond_info->rx_hashtbl[curr_index]); u32 next_index = bond_info->rx_hashtbl[curr_index].used_next; if (curr->vlan_id == vlan_id) rlb_delete_table_entry(bond, curr_index); curr_index = next_index; } spin_unlock_bh(&bond->mode_lock); } /*********************** tlb/rlb shared functions *********************/ static void alb_send_lp_vid(struct slave *slave, const u8 mac_addr[], __be16 vlan_proto, u16 vid) { struct learning_pkt pkt; struct sk_buff *skb; int size = sizeof(struct learning_pkt); memset(&pkt, 0, size); ether_addr_copy(pkt.mac_dst, mac_addr); ether_addr_copy(pkt.mac_src, mac_addr); pkt.type = cpu_to_be16(ETH_P_LOOPBACK); skb = dev_alloc_skb(size); if (!skb) return; skb_put_data(skb, &pkt, size); skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = pkt.type; skb->priority = TC_PRIO_CONTROL; skb->dev = slave->dev; slave_dbg(slave->bond->dev, slave->dev, "Send learning packet: mac %pM vlan %d\n", mac_addr, vid); if (vid) __vlan_hwaccel_put_tag(skb, vlan_proto, vid); dev_queue_xmit(skb); } struct alb_walk_data { struct bonding *bond; struct slave *slave; const u8 *mac_addr; bool strict_match; }; static int alb_upper_dev_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct alb_walk_data *data = (struct alb_walk_data *)priv->data; bool strict_match = data->strict_match; const u8 *mac_addr = data->mac_addr; struct bonding *bond = data->bond; struct slave *slave = data->slave; struct bond_vlan_tag *tags; if (is_vlan_dev(upper) && bond->dev->lower_level == upper->lower_level - 1) { if (upper->addr_assign_type == NET_ADDR_STOLEN) { alb_send_lp_vid(slave, mac_addr, vlan_dev_vlan_proto(upper), vlan_dev_vlan_id(upper)); } else { alb_send_lp_vid(slave, upper->dev_addr, vlan_dev_vlan_proto(upper), vlan_dev_vlan_id(upper)); } } /* If this is a macvlan device, then only send updates * when strict_match is turned off. */ if (netif_is_macvlan(upper) && !strict_match) { tags = bond_verify_device_path(bond->dev, upper, 0); if (IS_ERR_OR_NULL(tags)) return -ENOMEM; alb_send_lp_vid(slave, upper->dev_addr, tags[0].vlan_proto, tags[0].vlan_id); kfree(tags); } return 0; } static void alb_send_learning_packets(struct slave *slave, const u8 mac_addr[], bool strict_match) { struct bonding *bond = bond_get_bond_by_slave(slave); struct netdev_nested_priv priv; struct alb_walk_data data = { .strict_match = strict_match, .mac_addr = mac_addr, .slave = slave, .bond = bond, }; priv.data = (void *)&data; /* send untagged */ alb_send_lp_vid(slave, mac_addr, 0, 0); /* loop through all devices and see if we need to send a packet * for that device. */ rcu_read_lock(); netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &priv); rcu_read_unlock(); } static int alb_set_slave_mac_addr(struct slave *slave, const u8 addr[], unsigned int len) { struct net_device *dev = slave->dev; struct sockaddr_storage ss; if (BOND_MODE(slave->bond) == BOND_MODE_TLB) { __dev_addr_set(dev, addr, len); return 0; } /* for rlb each slave must have a unique hw mac addresses so that * each slave will receive packets destined to a different mac */ memcpy(ss.__data, addr, len); ss.ss_family = dev->type; if (dev_set_mac_address(dev, (struct sockaddr *)&ss, NULL)) { slave_err(slave->bond->dev, dev, "dev_set_mac_address on slave failed! ALB mode requires that the base driver support setting the hw address also when the network device's interface is open\n"); return -EOPNOTSUPP; } return 0; } /* Swap MAC addresses between two slaves. * * Called with RTNL held, and no other locks. */ static void alb_swap_mac_addr(struct slave *slave1, struct slave *slave2) { u8 tmp_mac_addr[MAX_ADDR_LEN]; bond_hw_addr_copy(tmp_mac_addr, slave1->dev->dev_addr, slave1->dev->addr_len); alb_set_slave_mac_addr(slave1, slave2->dev->dev_addr, slave2->dev->addr_len); alb_set_slave_mac_addr(slave2, tmp_mac_addr, slave1->dev->addr_len); } /* Send learning packets after MAC address swap. * * Called with RTNL and no other locks */ static void alb_fasten_mac_swap(struct bonding *bond, struct slave *slave1, struct slave *slave2) { int slaves_state_differ = (bond_slave_can_tx(slave1) != bond_slave_can_tx(slave2)); struct slave *disabled_slave = NULL; ASSERT_RTNL(); /* fasten the change in the switch */ if (bond_slave_can_tx(slave1)) { alb_send_learning_packets(slave1, slave1->dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform the clients that the mac address * has changed */ rlb_req_update_slave_clients(bond, slave1); } } else { disabled_slave = slave1; } if (bond_slave_can_tx(slave2)) { alb_send_learning_packets(slave2, slave2->dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform the clients that the mac address * has changed */ rlb_req_update_slave_clients(bond, slave2); } } else { disabled_slave = slave2; } if (bond->alb_info.rlb_enabled && slaves_state_differ) { /* A disabled slave was assigned an active mac addr */ rlb_teach_disabled_mac_on_primary(bond, disabled_slave->dev->dev_addr); } } /** * alb_change_hw_addr_on_detach * @bond: bonding we're working on * @slave: the slave that was just detached * * We assume that @slave was already detached from the slave list. * * If @slave's permanent hw address is different both from its current * address and from @bond's address, then somewhere in the bond there's * a slave that has @slave's permanet address as its current address. * We'll make sure that slave no longer uses @slave's permanent address. * * Caller must hold RTNL and no other locks */ static void alb_change_hw_addr_on_detach(struct bonding *bond, struct slave *slave) { int perm_curr_diff; int perm_bond_diff; struct slave *found_slave; perm_curr_diff = !ether_addr_equal_64bits(slave->perm_hwaddr, slave->dev->dev_addr); perm_bond_diff = !ether_addr_equal_64bits(slave->perm_hwaddr, bond->dev->dev_addr); if (perm_curr_diff && perm_bond_diff) { found_slave = bond_slave_has_mac(bond, slave->perm_hwaddr); if (found_slave) { alb_swap_mac_addr(slave, found_slave); alb_fasten_mac_swap(bond, slave, found_slave); } } } /** * alb_handle_addr_collision_on_attach * @bond: bonding we're working on * @slave: the slave that was just attached * * checks uniqueness of slave's mac address and handles the case the * new slave uses the bonds mac address. * * If the permanent hw address of @slave is @bond's hw address, we need to * find a different hw address to give @slave, that isn't in use by any other * slave in the bond. This address must be, of course, one of the permanent * addresses of the other slaves. * * We go over the slave list, and for each slave there we compare its * permanent hw address with the current address of all the other slaves. * If no match was found, then we've found a slave with a permanent address * that isn't used by any other slave in the bond, so we can assign it to * @slave. * * assumption: this function is called before @slave is attached to the * bond slave list. */ static int alb_handle_addr_collision_on_attach(struct bonding *bond, struct slave *slave) { struct slave *has_bond_addr = rcu_access_pointer(bond->curr_active_slave); struct slave *tmp_slave1, *free_mac_slave = NULL; struct list_head *iter; if (!bond_has_slaves(bond)) { /* this is the first slave */ return 0; } /* if slave's mac address differs from bond's mac address * check uniqueness of slave's mac address against the other * slaves in the bond. */ if (!ether_addr_equal_64bits(slave->perm_hwaddr, bond->dev->dev_addr)) { if (!bond_slave_has_mac(bond, slave->dev->dev_addr)) return 0; /* Try setting slave mac to bond address and fall-through * to code handling that situation below... */ alb_set_slave_mac_addr(slave, bond->dev->dev_addr, bond->dev->addr_len); } /* The slave's address is equal to the address of the bond. * Search for a spare address in the bond for this slave. */ bond_for_each_slave(bond, tmp_slave1, iter) { if (!bond_slave_has_mac(bond, tmp_slave1->perm_hwaddr)) { /* no slave has tmp_slave1's perm addr * as its curr addr */ free_mac_slave = tmp_slave1; break; } if (!has_bond_addr) { if (ether_addr_equal_64bits(tmp_slave1->dev->dev_addr, bond->dev->dev_addr)) { has_bond_addr = tmp_slave1; } } } if (free_mac_slave) { alb_set_slave_mac_addr(slave, free_mac_slave->perm_hwaddr, free_mac_slave->dev->addr_len); slave_warn(bond->dev, slave->dev, "the slave hw address is in use by the bond; giving it the hw address of %s\n", free_mac_slave->dev->name); } else if (has_bond_addr) { slave_err(bond->dev, slave->dev, "the slave hw address is in use by the bond; couldn't find a slave with a free hw address to give it (this should not have happened)\n"); return -EFAULT; } return 0; } /** * alb_set_mac_address * @bond: bonding we're working on * @addr: MAC address to set * * In TLB mode all slaves are configured to the bond's hw address, but set * their dev_addr field to different addresses (based on their permanent hw * addresses). * * For each slave, this function sets the interface to the new address and then * changes its dev_addr field to its previous value. * * Unwinding assumes bond's mac address has not yet changed. */ static int alb_set_mac_address(struct bonding *bond, void *addr) { struct slave *slave, *rollback_slave; struct list_head *iter; struct sockaddr_storage ss; char tmp_addr[MAX_ADDR_LEN]; int res; if (bond->alb_info.rlb_enabled) return 0; bond_for_each_slave(bond, slave, iter) { /* save net_device's current hw address */ bond_hw_addr_copy(tmp_addr, slave->dev->dev_addr, slave->dev->addr_len); res = dev_set_mac_address(slave->dev, addr, NULL); /* restore net_device's hw address */ dev_addr_set(slave->dev, tmp_addr); if (res) goto unwind; } return 0; unwind: memcpy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { if (rollback_slave == slave) break; bond_hw_addr_copy(tmp_addr, rollback_slave->dev->dev_addr, rollback_slave->dev->addr_len); dev_set_mac_address(rollback_slave->dev, (struct sockaddr *)&ss, NULL); dev_addr_set(rollback_slave->dev, tmp_addr); } return res; } /* determine if the packet is NA or NS */ static bool alb_determine_nd(struct sk_buff *skb, struct bonding *bond) { struct ipv6hdr *ip6hdr; struct icmp6hdr *hdr; if (!pskb_network_may_pull(skb, sizeof(*ip6hdr))) return true; ip6hdr = ipv6_hdr(skb); if (ip6hdr->nexthdr != IPPROTO_ICMPV6) return false; if (!pskb_network_may_pull(skb, sizeof(*ip6hdr) + sizeof(*hdr))) return true; hdr = icmp6_hdr(skb); return hdr->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT || hdr->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION; } /************************ exported alb functions ************************/ int bond_alb_initialize(struct bonding *bond, int rlb_enabled) { int res; res = tlb_initialize(bond); if (res) return res; if (rlb_enabled) { res = rlb_initialize(bond); if (res) { tlb_deinitialize(bond); return res; } bond->alb_info.rlb_enabled = 1; } else { bond->alb_info.rlb_enabled = 0; } return 0; } void bond_alb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); tlb_deinitialize(bond); if (bond_info->rlb_enabled) rlb_deinitialize(bond); } static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond, struct slave *tx_slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct ethhdr *eth_data = eth_hdr(skb); if (!tx_slave) { /* unbalanced or unassigned, send through primary */ tx_slave = rcu_dereference(bond->curr_active_slave); if (bond->params.tlb_dynamic_lb) bond_info->unbalanced_load += skb->len; } if (tx_slave && bond_slave_can_tx(tx_slave)) { if (tx_slave != rcu_access_pointer(bond->curr_active_slave)) { ether_addr_copy(eth_data->h_source, tx_slave->dev->dev_addr); } return bond_dev_queue_xmit(bond, skb, tx_slave->dev); } if (tx_slave && bond->params.tlb_dynamic_lb) { spin_lock(&bond->mode_lock); __tlb_clear_slave(bond, tx_slave, 0); spin_unlock(&bond->mode_lock); } /* no suitable interface, frame not sent */ return bond_tx_drop(bond->dev, skb); } struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, struct sk_buff *skb) { struct slave *tx_slave = NULL; struct ethhdr *eth_data; u32 hash_index; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); /* Do not TX balance any multicast or broadcast */ if (!is_multicast_ether_addr(eth_data->h_dest)) { switch (skb->protocol) { case htons(ETH_P_IPV6): if (alb_determine_nd(skb, bond)) break; fallthrough; case htons(ETH_P_IP): hash_index = bond_xmit_hash(bond, skb); if (bond->params.tlb_dynamic_lb) { tx_slave = tlb_choose_channel(bond, hash_index & 0xFF, skb->len); } else { struct bond_up_slave *slaves; unsigned int count; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[hash_index % count]; } break; } } return tx_slave; } netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *tx_slave; tx_slave = bond_xmit_tlb_slave_get(bond, skb); return bond_do_alb_xmit(skb, bond, tx_slave); } struct slave *bond_xmit_alb_slave_get(struct bonding *bond, struct sk_buff *skb) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); static const __be32 ip_bcast = htonl(0xffffffff); struct slave *tx_slave = NULL; const u8 *hash_start = NULL; bool do_tx_balance = true; struct ethhdr *eth_data; u32 hash_index = 0; int hash_size = 0; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); switch (ntohs(skb->protocol)) { case ETH_P_IP: { const struct iphdr *iph; if (is_broadcast_ether_addr(eth_data->h_dest) || !pskb_network_may_pull(skb, sizeof(*iph))) { do_tx_balance = false; break; } iph = ip_hdr(skb); if (iph->daddr == ip_bcast || iph->protocol == IPPROTO_IGMP) { do_tx_balance = false; break; } hash_start = (char *)&(iph->daddr); hash_size = sizeof(iph->daddr); break; } case ETH_P_IPV6: { const struct ipv6hdr *ip6hdr; /* IPv6 doesn't really use broadcast mac address, but leave * that here just in case. */ if (is_broadcast_ether_addr(eth_data->h_dest)) { do_tx_balance = false; break; } /* IPv6 uses all-nodes multicast as an equivalent to * broadcasts in IPv4. */ if (ether_addr_equal_64bits(eth_data->h_dest, mac_v6_allmcast)) { do_tx_balance = false; break; } if (alb_determine_nd(skb, bond)) { do_tx_balance = false; break; } /* The IPv6 header is pulled by alb_determine_nd */ /* Additionally, DAD probes should not be tx-balanced as that * will lead to false positives for duplicate addresses and * prevent address configuration from working. */ ip6hdr = ipv6_hdr(skb); if (ipv6_addr_any(&ip6hdr->saddr)) { do_tx_balance = false; break; } hash_start = (char *)&ip6hdr->daddr; hash_size = sizeof(ip6hdr->daddr); break; } case ETH_P_ARP: do_tx_balance = false; if (bond_info->rlb_enabled) tx_slave = rlb_arp_xmit(skb, bond); break; default: do_tx_balance = false; break; } if (do_tx_balance) { if (bond->params.tlb_dynamic_lb) { hash_index = _simple_hash(hash_start, hash_size); tx_slave = tlb_choose_channel(bond, hash_index, skb->len); } else { /* * do_tx_balance means we are free to select the tx_slave * So we do exactly what tlb would do for hash selection */ struct bond_up_slave *slaves; unsigned int count; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[bond_xmit_hash(bond, skb) % count]; } } return tx_slave; } netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *tx_slave = NULL; tx_slave = bond_xmit_alb_slave_get(bond, skb); return bond_do_alb_xmit(skb, bond, tx_slave); } void bond_alb_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, alb_work.work); struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct list_head *iter; struct slave *slave; if (!bond_has_slaves(bond)) { atomic_set(&bond_info->tx_rebalance_counter, 0); bond_info->lp_counter = 0; goto re_arm; } rcu_read_lock(); atomic_inc(&bond_info->tx_rebalance_counter); bond_info->lp_counter++; /* send learning packets */ if (bond_info->lp_counter >= BOND_ALB_LP_TICKS(bond)) { bool strict_match; bond_for_each_slave_rcu(bond, slave, iter) { /* If updating current_active, use all currently * user mac addresses (!strict_match). Otherwise, only * use mac of the slave device. * In RLB mode, we always use strict matches. */ strict_match = (slave != rcu_access_pointer(bond->curr_active_slave) || bond_info->rlb_enabled); alb_send_learning_packets(slave, slave->dev->dev_addr, strict_match); } bond_info->lp_counter = 0; } /* rebalance tx traffic */ if (atomic_read(&bond_info->tx_rebalance_counter) >= BOND_TLB_REBALANCE_TICKS) { bond_for_each_slave_rcu(bond, slave, iter) { tlb_clear_slave(bond, slave, 1); if (slave == rcu_access_pointer(bond->curr_active_slave)) { SLAVE_TLB_INFO(slave).load = bond_info->unbalanced_load / BOND_TLB_REBALANCE_INTERVAL; bond_info->unbalanced_load = 0; } } atomic_set(&bond_info->tx_rebalance_counter, 0); } if (bond_info->rlb_enabled) { if (bond_info->primary_is_promisc && (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) { /* dev_set_promiscuity requires rtnl and * nothing else. Avoid race with bond_close. */ rcu_read_unlock(); if (!rtnl_trylock()) goto re_arm; bond_info->rlb_promisc_timeout_counter = 0; /* If the primary was set to promiscuous mode * because a slave was disabled then * it can now leave promiscuous mode. */ dev_set_promiscuity(rtnl_dereference(bond->curr_active_slave)->dev, -1); bond_info->primary_is_promisc = 0; rtnl_unlock(); rcu_read_lock(); } if (bond_info->rlb_rebalance) { bond_info->rlb_rebalance = 0; rlb_rebalance(bond); } /* check if clients need updating */ if (bond_info->rx_ntt) { if (bond_info->rlb_update_delay_counter) { --bond_info->rlb_update_delay_counter; } else { rlb_update_rx_clients(bond); if (bond_info->rlb_update_retry_counter) --bond_info->rlb_update_retry_counter; else bond_info->rx_ntt = 0; } } } rcu_read_unlock(); re_arm: queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks); } /* assumption: called before the slave is attached to the bond * and not locked by the bond lock */ int bond_alb_init_slave(struct bonding *bond, struct slave *slave) { int res; res = alb_set_slave_mac_addr(slave, slave->perm_hwaddr, slave->dev->addr_len); if (res) return res; res = alb_handle_addr_collision_on_attach(bond, slave); if (res) return res; tlb_init_slave(slave); /* order a rebalance ASAP */ atomic_set(&bond->alb_info.tx_rebalance_counter, BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) bond->alb_info.rlb_rebalance = 1; return 0; } /* Remove slave from tlb and rlb hash tables, and fix up MAC addresses * if necessary. * * Caller must hold RTNL and no other locks */ void bond_alb_deinit_slave(struct bonding *bond, struct slave *slave) { if (bond_has_slaves(bond)) alb_change_hw_addr_on_detach(bond, slave); tlb_clear_slave(bond, slave, 0); if (bond->alb_info.rlb_enabled) { bond->alb_info.rx_slave = NULL; rlb_clear_slave(bond, slave); } } void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char link) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); if (link == BOND_LINK_DOWN) { tlb_clear_slave(bond, slave, 0); if (bond->alb_info.rlb_enabled) rlb_clear_slave(bond, slave); } else if (link == BOND_LINK_UP) { /* order a rebalance ASAP */ atomic_set(&bond_info->tx_rebalance_counter, BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) { bond->alb_info.rlb_rebalance = 1; /* If the updelay module parameter is smaller than the * forwarding delay of the switch the rebalance will * not work because the rebalance arp replies will * not be forwarded to the clients.. */ } } if (bond_is_nondyn_tlb(bond)) { if (bond_update_slave_arr(bond, NULL)) pr_err("Failed to build slave-array for TLB mode.\n"); } } /** * bond_alb_handle_active_change - assign new curr_active_slave * @bond: our bonding struct * @new_slave: new slave to assign * * Set the bond->curr_active_slave to @new_slave and handle * mac address swapping and promiscuity changes as needed. * * Caller must hold RTNL */ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave) { struct slave *swap_slave; struct slave *curr_active; curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active == new_slave) return; if (curr_active && bond->alb_info.primary_is_promisc) { dev_set_promiscuity(curr_active->dev, -1); bond->alb_info.primary_is_promisc = 0; bond->alb_info.rlb_promisc_timeout_counter = 0; } swap_slave = curr_active; rcu_assign_pointer(bond->curr_active_slave, new_slave); if (!new_slave || !bond_has_slaves(bond)) return; /* set the new curr_active_slave to the bonds mac address * i.e. swap mac addresses of old curr_active_slave and new curr_active_slave */ if (!swap_slave) swap_slave = bond_slave_has_mac(bond, bond->dev->dev_addr); /* Arrange for swap_slave and new_slave to temporarily be * ignored so we can mess with their MAC addresses without * fear of interference from transmit activity. */ if (swap_slave) tlb_clear_slave(bond, swap_slave, 1); tlb_clear_slave(bond, new_slave, 1); /* in TLB mode, the slave might flip down/up with the old dev_addr, * and thus filter bond->dev_addr's packets, so force bond's mac */ if (BOND_MODE(bond) == BOND_MODE_TLB) { struct sockaddr_storage ss; u8 tmp_addr[MAX_ADDR_LEN]; bond_hw_addr_copy(tmp_addr, new_slave->dev->dev_addr, new_slave->dev->addr_len); bond_hw_addr_copy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; /* we don't care if it can't change its mac, best effort */ dev_set_mac_address(new_slave->dev, (struct sockaddr *)&ss, NULL); dev_addr_set(new_slave->dev, tmp_addr); } /* curr_active_slave must be set before calling alb_swap_mac_addr */ if (swap_slave) { /* swap mac address */ alb_swap_mac_addr(swap_slave, new_slave); alb_fasten_mac_swap(bond, swap_slave, new_slave); } else { /* set the new_slave to the bond mac address */ alb_set_slave_mac_addr(new_slave, bond->dev->dev_addr, bond->dev->addr_len); alb_send_learning_packets(new_slave, bond->dev->dev_addr, false); } } /* Called with RTNL */ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr) { struct bonding *bond = netdev_priv(bond_dev); struct sockaddr_storage *ss = addr; struct slave *curr_active; struct slave *swap_slave; int res; if (!is_valid_ether_addr(ss->__data)) return -EADDRNOTAVAIL; res = alb_set_mac_address(bond, addr); if (res) return res; dev_addr_set(bond_dev, ss->__data); /* If there is no curr_active_slave there is nothing else to do. * Otherwise we'll need to pass the new address to it and handle * duplications. */ curr_active = rtnl_dereference(bond->curr_active_slave); if (!curr_active) return 0; swap_slave = bond_slave_has_mac(bond, bond_dev->dev_addr); if (swap_slave) { alb_swap_mac_addr(swap_slave, curr_active); alb_fasten_mac_swap(bond, swap_slave, curr_active); } else { alb_set_slave_mac_addr(curr_active, bond_dev->dev_addr, bond_dev->addr_len); alb_send_learning_packets(curr_active, bond_dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform clients mac address has changed */ rlb_req_update_slave_clients(bond, curr_active); } } return 0; } void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id) { if (bond->alb_info.rlb_enabled) rlb_clear_vlan(bond, vlan_id); } |
| 40 40 40 40 49 49 3 1 49 3 31 23 1 11 12 2 23 6 42 34 11 7 41 33 24 23 36 31 16 36 17 37 37 17 1 9 6 1 49 50 50 49 1 44 15 1 1 2 4 26 1 25 10 47 47 47 47 50 46 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 | // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The IP fragmentation functionality. * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * Alan Cox : Split from ip.c , see ip_input.c for history. * David S. Miller : Begin massive cleanup... * Andi Kleen : Add sysctls. * xxxx : Overlapfrag bug. * Ultima : ip_expire() kernel panic. * Bill Hawes : Frag accounting and evictor fixes. * John McDonald : 0 length frag bug. * Alexey Kuznetsov: SMP races, threading, cleanup. * Patrick McHardy : LRU queue of frag heads for evictor. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/compiler.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/jiffies.h> #include <linux/skbuff.h> #include <linux/list.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/slab.h> #include <net/route.h> #include <net/dst.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> #include <net/inet_frag.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> #include <net/l3mdev.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ static const char ip_frag_cache_name[] = "ip4-frags"; /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; }; static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); const struct frag_v4_compare_key *key = a; struct net *net = q->fqdir->net; struct inet_peer *p = NULL; q->key.v4 = *key; qp->ecn = 0; if (q->fqdir->max_dist) { rcu_read_lock(); p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); if (p && !refcount_inc_not_zero(&p->refcnt)) p = NULL; rcu_read_unlock(); } qp->peer = p; } static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; qp = container_of(q, struct ipq, q); if (qp->peer) inet_putpeer(qp->peer); } /* Destruction primitives. */ static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, * because caller (and someone more) holds reference count. */ static void ipq_kill(struct ipq *ipq) { inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END) || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN); } /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ static void ip_expire(struct timer_list *t) { enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; rcu_read_lock(); /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ if (READ_ONCE(qp->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) goto out; qp->q.flags |= INET_FRAG_DROP; ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); if (!(qp->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. So we * pull the head out of the tree in order to be able to * deal with head->dev. */ head = inet_frag_pull_head(&qp->q); if (!head) goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); reason = ip_route_input_noref(head, iph->daddr, iph->saddr, ip4h_dscp(iph), head->dev); if (reason) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); kfree_skb_reason(head, reason); ipq_put(qp); } /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { struct frag_v4_compare_key key = { .saddr = iph->saddr, .daddr = iph->daddr, .user = user, .vif = vif, .id = iph->id, .protocol = iph->protocol, }; struct inet_frag_queue *q; q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; return container_of(q, struct ipq, q); } /* Is the fragment too far ahead to be part of ipq? */ static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; if (!peer || !max) return 0; start = qp->rid; end = atomic_inc_return(&peer->rid); qp->rid = end; rc = qp->q.fragments_tail && (end - start) > max; if (rc) __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, SKB_DROP_REASON_FRAG_TOO_FAR); sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; qp->iif = 0; qp->ecn = 0; return 0; } /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; int err = -ENOENT; SKB_DR(reason); u8 ecn; /* If reassembly is already done, @skb must be a duplicate frag. */ if (qp->q.flags & INET_FRAG_COMPLETE) { SKB_DR_SET(reason, DUP_FRAG); goto err; } if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { ipq_kill(qp); goto err; } ecn = ip4_frag_ecn(ip_hdr(skb)->tos); offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; offset <<= 3; /* offset is in 8-byte chunks */ ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ end = offset + skb->len - skb_network_offset(skb) - ihl; err = -EINVAL; /* Is this the final fragment? */ if ((flags & IP_MF) == 0) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < qp->q.len || ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto discard_qp; qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { end &= ~7; if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->ip_summed = CHECKSUM_NONE; } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ if (qp->q.flags & INET_FRAG_LAST_IN) goto discard_qp; qp->q.len = end; } } if (end == offset) goto discard_qp; err = -ENOMEM; if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) goto discard_qp; err = pskb_trim_rcsum(skb, end - offset); if (err) goto discard_qp; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev_tail = qp->q.fragments_tail; err = inet_frag_queue_insert(&qp->q, skb, offset, end); if (err) goto insert_error; if (dev) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; qp->q.tstamp_type = skb->tstamp_type; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; fragsize = skb->len + ihl; if (fragsize > qp->q.max_size) qp->q.max_size = fragsize; if (ip_hdr(skb)->frag_off & htons(IP_DF) && fragsize > qp->max_df_size) qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip_frag_reasm(qp, skb, prev_tail, dev); skb->_skb_refdst = orefdst; if (err) inet_frag_kill(&qp->q); return err; } skb_dst_drop(skb); skb_orphan(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { SKB_DR_SET(reason, DUP_FRAG); err = -EINVAL; goto err; } err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); return err; } static bool ip_frag_coalesce_ok(const struct ipq *qp) { return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER; } /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; u8 ecn; ipq_kill(qp); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; } /* Make the one we just received the head. */ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); if (!reasm_data) goto out_nomem; len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; inet_frag_reasm_finish(&qp->q, skb, reasm_data, ip_frag_coalesce_ok(qp)); skb->dev = dev; IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a * call to ip_fragment to avoid forwarding a DF-skb of size s while * original sender only sent fragments of size f (where f < s). * * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest * frag seen to avoid sending tiny DF-fragments in case skb was built * from one very small df-fragment and one large non-df frag. */ if (qp->max_df_size == qp->q.max_size) { IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; } ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; return 0; out_nomem: net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; } /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret; spin_lock(&qp->q.lock); ret = ip_frag_queue(qp, skb); spin_unlock(&qp->q.lock); ipq_put(qp); return ret; } __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; netoff = skb_network_offset(skb); if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) return skb; len = ntohs(iph.tot_len); if (skb->len < netoff + len || len < (iph.ihl * 4)) return skb; if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { kfree_skb(skb); return NULL; } if (pskb_trim_rcsum(skb, netoff + len)) { kfree_skb(skb); return NULL; } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } } return skb; } EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, }; /* secret interval has been deprecated */ static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip4_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &net->ipv4.fqdir->high_thresh; table[0].extra1 = &net->ipv4.fqdir->low_thresh; table[1].data = &net->ipv4.fqdir->low_thresh; table[1].extra2 = &net->ipv4.fqdir->high_thresh; table[2].data = &net->ipv4.fqdir->timeout; table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl_sz(net, "net/ipv4", table, ARRAY_SIZE(ip4_frags_ns_ctl_table)); if (!hdr) goto err_reg; net->ipv4.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { const struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); kfree(table); } static void __init ip4_frags_ctl_register(void) { register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } static void ip4_frags_ns_ctl_unregister(struct net *net) { } static void __init ip4_frags_ctl_register(void) { } #endif static int __net_init ipv4_frags_init_net(struct net *net) { int res; res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); if (res < 0) return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for * the real memory usage, by measuring both the size of frag * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) * and the SKB's truesize. * * A 64K fragment consumes 129736 bytes (44*2944)+200 * (1500 truesize == 2944, sizeof(struct ipq) == 200) * * We will commit 4MB at one time. Should we cross that limit * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. */ net->ipv4.fqdir->timeout = IP_FRAG_TIME; net->ipv4.fqdir->max_dist = 64; res = ip4_frags_ns_ctl_register(net); if (res < 0) fqdir_exit(net->ipv4.fqdir); return res; } static void __net_exit ipv4_frags_pre_exit_net(struct net *net) { fqdir_pre_exit(net->ipv4.fqdir); } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { .init = ipv4_frags_init_net, .pre_exit = ipv4_frags_pre_exit_net, .exit = ipv4_frags_exit_net, }; static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key.v4, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_v4_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params ip4_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .key_offset = offsetof(struct inet_frag_queue, key), .key_len = sizeof(struct frag_v4_compare_key), .hashfn = ip4_key_hashfn, .obj_hashfn = ip4_obj_hashfn, .obj_cmpfn = ip4_obj_cmpfn, .automatic_shrinking = true, }; void __init ipfrag_init(void) { ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops); } |
| 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/addr.h * * Various routines for copying and comparing sockaddrs and for * converting them to and from presentation format. */ #ifndef _LINUX_SUNRPC_ADDR_H #define _LINUX_SUNRPC_ADDR_H #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> #include <net/ipv6.h> size_t rpc_ntop(const struct sockaddr *, char *, const size_t); size_t rpc_pton(struct net *, const char *, const size_t, struct sockaddr *, const size_t); char * rpc_sockaddr2uaddr(const struct sockaddr *, gfp_t); size_t rpc_uaddr2sockaddr(struct net *, const char *, const size_t, struct sockaddr *, const size_t); static inline unsigned short rpc_get_port(const struct sockaddr *sap) { switch (sap->sa_family) { case AF_INET: return ntohs(((struct sockaddr_in *)sap)->sin_port); case AF_INET6: return ntohs(((struct sockaddr_in6 *)sap)->sin6_port); } return 0; } static inline void rpc_set_port(struct sockaddr *sap, const unsigned short port) { switch (sap->sa_family) { case AF_INET: ((struct sockaddr_in *)sap)->sin_port = htons(port); break; case AF_INET6: ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); break; } } #define IPV6_SCOPE_DELIMITER '%' #define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") static inline bool rpc_cmp_addr4(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } static inline bool __rpc_copy_addr4(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in *ssin = (struct sockaddr_in *) src; struct sockaddr_in *dsin = (struct sockaddr_in *) dst; dsin->sin_family = ssin->sin_family; dsin->sin_addr.s_addr = ssin->sin_addr.s_addr; return true; } #if IS_ENABLED(CONFIG_IPV6) static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return false; else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) return sin1->sin6_scope_id == sin2->sin6_scope_id; return true; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src; struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst; dsin6->sin6_family = ssin6->sin6_family; dsin6->sin6_addr = ssin6->sin6_addr; dsin6->sin6_scope_id = ssin6->sin6_scope_id; return true; } #else /* !(IS_ENABLED(CONFIG_IPV6) */ static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { return false; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { return false; } #endif /* !(IS_ENABLED(CONFIG_IPV6) */ /** * rpc_cmp_addr - compare the address portion of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr * * Just compares the family and address portion. Ignores port, but * compares the scope if it's a link-local address. * * Returns true if the addrs are equal, false if they aren't. */ static inline bool rpc_cmp_addr(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (sap1->sa_family == sap2->sa_family) { switch (sap1->sa_family) { case AF_INET: return rpc_cmp_addr4(sap1, sap2); case AF_INET6: return rpc_cmp_addr6(sap1, sap2); } } return false; } /** * rpc_cmp_addr_port - compare the address and port number of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr */ static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (!rpc_cmp_addr(sap1, sap2)) return false; return rpc_get_port(sap1) == rpc_get_port(sap2); } /** * rpc_copy_addr - copy the address portion of one sockaddr to another * @dst: destination sockaddr * @src: source sockaddr * * Just copies the address portion and family. Ignores port, scope, etc. * Caller is responsible for making certain that dst is large enough to hold * the address in src. Returns true if address family is supported. Returns * false otherwise. */ static inline bool rpc_copy_addr(struct sockaddr *dst, const struct sockaddr *src) { switch (src->sa_family) { case AF_INET: return __rpc_copy_addr4(dst, src); case AF_INET6: return __rpc_copy_addr6(dst, src); } return false; } /** * rpc_get_scope_id - return scopeid for a given sockaddr * @sa: sockaddr to get scopeid from * * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if * not an AF_INET6 address. */ static inline u32 rpc_get_scope_id(const struct sockaddr *sa) { if (sa->sa_family != AF_INET6) return 0; return ((struct sockaddr_in6 *) sa)->sin6_scope_id; } #endif /* _LINUX_SUNRPC_ADDR_H */ |
| 10 27 27 10 18 17 22 19 9 12 9 12 30 29 23 1 8 15 25 29 29 21 24 29 20 9 9 29 14 13 14 12 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 | // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include <crypto/algapi.h> #include <crypto/internal/hash.h> #include <crypto/internal/poly1305.h> #include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sizes.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void poly1305_init_x86_64(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, const size_t len, const u32 padbit); asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]); asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]); asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, const u32 padbit); asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, const u32 padbit); asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, const size_t len, const u32 padbit); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); struct poly1305_arch_internal { union { struct { u32 h[5]; u32 is_base2_26; }; u64 hs[3]; }; u64 r[2]; u64 pad; struct { u32 r2, r1, r4, r3; } rn[9]; }; /* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit * the unfortunate situation of using AVX and then having to go back to scalar * -- because the user is silly and has called the update function from two * separate contexts -- then we need to convert back to the original base before * proceeding. It is possible to reason that the initial reduction below is * sufficient given the implementation invariants. However, for an avoidance of * doubt and because this is not performance critical, we do the full reduction * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py */ static void convert_to_base2_64(void *ctx) { struct poly1305_arch_internal *state = ctx; u32 cy; if (!state->is_base2_26) return; cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); state->hs[2] = state->h[4] >> 24; #define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); state->hs[2] &= 3; state->hs[0] += cy; state->hs[1] += (cy = ULT(state->hs[0], cy)); state->hs[2] += ULT(state->hs[1], cy); #undef ULT state->is_base2_26 = 0; } static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) { poly1305_init_x86_64(ctx, key); } static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, const u32 padbit) { struct poly1305_arch_internal *state = ctx; /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || SZ_4K % POLY1305_BLOCK_SIZE); if (!static_branch_likely(&poly1305_use_avx) || (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || !crypto_simd_usable()) { convert_to_base2_64(ctx); poly1305_blocks_x86_64(ctx, inp, len, padbit); return; } do { const size_t bytes = min_t(size_t, len, SZ_4K); kernel_fpu_begin(); if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) poly1305_blocks_avx512(ctx, inp, bytes, padbit); else if (static_branch_likely(&poly1305_use_avx2)) poly1305_blocks_avx2(ctx, inp, bytes, padbit); else poly1305_blocks_avx(ctx, inp, bytes, padbit); kernel_fpu_end(); len -= bytes; inp += bytes; } while (len); } static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) { if (!static_branch_likely(&poly1305_use_avx)) poly1305_emit_x86_64(ctx, mac, nonce); else poly1305_emit_avx(ctx, mac, nonce); } void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_simd_init(&dctx->h, key); dctx->s[0] = get_unaligned_le32(&key[16]); dctx->s[1] = get_unaligned_le32(&key[20]); dctx->s[2] = get_unaligned_le32(&key[24]); dctx->s[3] = get_unaligned_le32(&key[28]); dctx->buflen = 0; dctx->sset = true; } EXPORT_SYMBOL(poly1305_init_arch); static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, const u8 *inp, unsigned int len) { unsigned int acc = 0; if (unlikely(!dctx->sset)) { if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { poly1305_simd_init(&dctx->h, inp); inp += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; acc += POLY1305_BLOCK_SIZE; dctx->rset = 1; } if (len >= POLY1305_BLOCK_SIZE) { dctx->s[0] = get_unaligned_le32(&inp[0]); dctx->s[1] = get_unaligned_le32(&inp[4]); dctx->s[2] = get_unaligned_le32(&inp[8]); dctx->s[3] = get_unaligned_le32(&inp[12]); acc += POLY1305_BLOCK_SIZE; dctx->sset = true; } } return acc; } void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { unsigned int bytes, used; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); memcpy(dctx->buf + dctx->buflen, src, bytes); src += bytes; srclen -= bytes; dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { bytes = round_down(srclen, POLY1305_BLOCK_SIZE); srclen -= bytes; used = crypto_poly1305_setdctxkey(dctx, src, bytes); if (likely(bytes - used)) poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); src += bytes; } if (unlikely(srclen)) { dctx->buflen = srclen; memcpy(dctx->buf, src, srclen); } } EXPORT_SYMBOL(poly1305_update_arch); void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) { if (unlikely(dctx->buflen)) { dctx->buf[dctx->buflen++] = 1; memset(dctx->buf + dctx->buflen, 0, POLY1305_BLOCK_SIZE - dctx->buflen); poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); } poly1305_simd_emit(&dctx->h, dst, dctx->s); memzero_explicit(dctx, sizeof(*dctx)); } EXPORT_SYMBOL(poly1305_final_arch); static int crypto_poly1305_init(struct shash_desc *desc) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); *dctx = (struct poly1305_desc_ctx){}; return 0; } static int crypto_poly1305_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); poly1305_update_arch(dctx, src, srclen); return 0; } static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); if (unlikely(!dctx->sset)) return -ENOKEY; poly1305_final_arch(dctx, dst); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, .init = crypto_poly1305_init, .update = crypto_poly1305_update, .final = crypto_poly1305_final, .descsize = sizeof(struct poly1305_desc_ctx), .base = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", .cra_priority = 300, .cra_blocksize = POLY1305_BLOCK_SIZE, .cra_module = THIS_MODULE, }, }; static int __init poly1305_simd_mod_init(void) { if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) static_branch_enable(&poly1305_use_avx); if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) static_branch_enable(&poly1305_use_avx2); if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) static_branch_enable(&poly1305_use_avx512); return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } static void __exit poly1305_simd_mod_exit(void) { if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) crypto_unregister_shash(&alg); } module_init(poly1305_simd_mod_init); module_exit(poly1305_simd_mod_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); MODULE_DESCRIPTION("Poly1305 authenticator"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-simd"); |
| 3 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | // SPDX-License-Identifier: GPL-2.0 /* * Generic USB GNSS receiver driver * * Copyright (C) 2021 Johan Hovold <johan@kernel.org> */ #include <linux/errno.h> #include <linux/gnss.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #define GNSS_USB_READ_BUF_LEN 512 #define GNSS_USB_WRITE_TIMEOUT 1000 static const struct usb_device_id gnss_usb_id_table[] = { { USB_DEVICE(0x1199, 0xb000) }, /* Sierra Wireless XM1210 */ { } }; MODULE_DEVICE_TABLE(usb, gnss_usb_id_table); struct gnss_usb { struct usb_device *udev; struct usb_interface *intf; struct gnss_device *gdev; struct urb *read_urb; unsigned int write_pipe; }; static void gnss_usb_rx_complete(struct urb *urb) { struct gnss_usb *gusb = urb->context; struct gnss_device *gdev = gusb->gdev; int status = urb->status; int len; int ret; switch (status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: dev_dbg(&gdev->dev, "urb stopped: %d\n", status); return; case -EPIPE: dev_err(&gdev->dev, "urb stopped: %d\n", status); return; default: dev_dbg(&gdev->dev, "nonzero urb status: %d\n", status); goto resubmit; } len = urb->actual_length; if (len == 0) goto resubmit; ret = gnss_insert_raw(gdev, urb->transfer_buffer, len); if (ret < len) dev_dbg(&gdev->dev, "dropped %d bytes\n", len - ret); resubmit: ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret && ret != -EPERM && ret != -ENODEV) dev_err(&gdev->dev, "failed to resubmit urb: %d\n", ret); } static int gnss_usb_open(struct gnss_device *gdev) { struct gnss_usb *gusb = gnss_get_drvdata(gdev); int ret; ret = usb_submit_urb(gusb->read_urb, GFP_KERNEL); if (ret) { if (ret != -EPERM && ret != -ENODEV) dev_err(&gdev->dev, "failed to submit urb: %d\n", ret); return ret; } return 0; } static void gnss_usb_close(struct gnss_device *gdev) { struct gnss_usb *gusb = gnss_get_drvdata(gdev); usb_kill_urb(gusb->read_urb); } static int gnss_usb_write_raw(struct gnss_device *gdev, const unsigned char *buf, size_t count) { struct gnss_usb *gusb = gnss_get_drvdata(gdev); void *tbuf; int ret; tbuf = kmemdup(buf, count, GFP_KERNEL); if (!tbuf) return -ENOMEM; ret = usb_bulk_msg(gusb->udev, gusb->write_pipe, tbuf, count, NULL, GNSS_USB_WRITE_TIMEOUT); kfree(tbuf); if (ret) return ret; return count; } static const struct gnss_operations gnss_usb_gnss_ops = { .open = gnss_usb_open, .close = gnss_usb_close, .write_raw = gnss_usb_write_raw, }; static int gnss_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_endpoint_descriptor *in, *out; struct gnss_device *gdev; struct gnss_usb *gusb; struct urb *urb; size_t buf_len; void *buf; int ret; ret = usb_find_common_endpoints(intf->cur_altsetting, &in, &out, NULL, NULL); if (ret) return ret; gusb = kzalloc(sizeof(*gusb), GFP_KERNEL); if (!gusb) return -ENOMEM; gdev = gnss_allocate_device(&intf->dev); if (!gdev) { ret = -ENOMEM; goto err_free_gusb; } gdev->ops = &gnss_usb_gnss_ops; gdev->type = GNSS_TYPE_NMEA; gnss_set_drvdata(gdev, gusb); urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { ret = -ENOMEM; goto err_put_gdev; } buf_len = max(usb_endpoint_maxp(in), GNSS_USB_READ_BUF_LEN); buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto err_free_urb; } usb_fill_bulk_urb(urb, udev, usb_rcvbulkpipe(udev, usb_endpoint_num(in)), buf, buf_len, gnss_usb_rx_complete, gusb); gusb->intf = intf; gusb->udev = udev; gusb->gdev = gdev; gusb->read_urb = urb; gusb->write_pipe = usb_sndbulkpipe(udev, usb_endpoint_num(out)); ret = gnss_register_device(gdev); if (ret) goto err_free_buf; usb_set_intfdata(intf, gusb); return 0; err_free_buf: kfree(buf); err_free_urb: usb_free_urb(urb); err_put_gdev: gnss_put_device(gdev); err_free_gusb: kfree(gusb); return ret; } static void gnss_usb_disconnect(struct usb_interface *intf) { struct gnss_usb *gusb = usb_get_intfdata(intf); gnss_deregister_device(gusb->gdev); kfree(gusb->read_urb->transfer_buffer); usb_free_urb(gusb->read_urb); gnss_put_device(gusb->gdev); kfree(gusb); } static struct usb_driver gnss_usb_driver = { .name = "gnss-usb", .probe = gnss_usb_probe, .disconnect = gnss_usb_disconnect, .id_table = gnss_usb_id_table, }; module_usb_driver(gnss_usb_driver); MODULE_AUTHOR("Johan Hovold <johan@kernel.org>"); MODULE_DESCRIPTION("Generic USB GNSS receiver driver"); MODULE_LICENSE("GPL v2"); |
| 25 25 23 8 22 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 | /* * net/tipc/discover.c * * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "node.h" #include "discover.h" /* min delay during bearer start up */ #define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ #define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ #define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ #define TIPC_DISC_INACTIVE 0xffffffff /** * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. with an active link) * |